Practice Set 1: Complete Walkthrough

Task 1: Platform Setup and Maintenance

1. Configure the hosts file

vi /etc/hosts

Add the following entries to the hosts file:

192.168.157.200 master
192.168.157.201 slave1
192.168.157.202 slave2

(Optional) Copy the finished hosts file to slave1 and slave2:

scp /etc/hosts slave1:/etc
scp /etc/hosts slave2:/etc

2. Configure passwordless SSH

1. Generate an RSA key pair

ssh-keygen -t rsa

Then press Enter four times to accept the defaults.

2. Copy the SSH public key to each node

ssh-copy-id master
ssh-copy-id slave1
ssh-copy-id slave2

Each copy prompts once for "yes" and once for that node's password.
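
(Optional) A quick way to confirm passwordless login works; the hostname command is just an illustration and should return without a password prompt:

ssh slave1 hostname
ssh slave2 hostname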

3. Install the JDK

Create the software directory

mkdir -p /root/software

Upload the JDK tarball for Linux (run from the Windows host):

scp -r "D:\集训相关\环境配置\大数据平台组件\jdk-8u401-linux-x64.tar.gz" root@192.168.157.200:/root/software

Extract the JDK tarball

tar -zxvf /root/software/jdk-8u401-linux-x64.tar.gz -C /root/software

Configure the JDK environment variables

# Create the jdk.sh file
vi /etc/profile.d/jdk.sh

# Add the following lines

export JAVA_HOME=/root/software/jdk1.8.0_401
export PATH=$PATH:$JAVA_HOME/bin
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar

# Save and exit

source /etc/profile # apply the environment variables

Check the JDK version

java -version

(Optional) Run javac to confirm the Java environment variables have taken effect

javac
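
(Optional) A minimal sanity check that the variables point where expected; the paths in the comments assume the install locations used above:

echo $JAVA_HOME   # expect /root/software/jdk1.8.0_401
which java        # expect $JAVA_HOME/bin/java if no other JDK comes earlier on the PATH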

4. Install Hadoop

Upload the Hadoop tarball

scp -r "D:\集训相关\环境配置\大数据平台组件\hadoop-3.3.6.tar.gz" root@192.168.157.200:/root/software

Extract the Hadoop tarball

tar -zxvf /root/software/hadoop-3.3.6.tar.gz -C /root/software

Create the data directories

mkdir -p /root/software/hadoop-3.3.6/hadoopDatas/tempDatas
mkdir -p /root/software/hadoop-3.3.6/hadoopDatas/namenodeDatas
mkdir -p /root/software/hadoop-3.3.6/hadoopDatas/datanodeDatas
mkdir -p /root/software/hadoop-3.3.6/hadoopDatas/dfs/nn/snn/edits
mkdir -p /root/software/hadoop-3.3.6/hadoopDatas/dfs/snn/edits
mkdir -p /root/software/hadoop-3.3.6/hadoopDatas/dfs/nn/name

Configure hadoop-env.sh

export JAVA_HOME=/root/software/jdk1.8.0_401
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_SECONDARYNAMENODE_USER=root
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root

Configure core-site.xml

# Replace the original <configuration> section with the following

<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://master:9000</value>
    </property>
</configuration>

Configure hdfs-site.xml

# Replace the original <configuration> section with the following

<configuration>
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>file:/root/software/hadoop-3.3.6/hadoopDatas/namenodeDatas</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>file:/root/software/hadoop-3.3.6/hadoopDatas/datanodeDatas</value>
    </property>
</configuration>

Configure mapred-site.xml

# Replace the original <configuration> section with the following

<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
    <property>
        <name>yarn.app.mapreduce.am.env</name>
        <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
    </property>
    <property>
        <name>mapreduce.map.env</name>
        <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
    </property>
    <property>
        <name>mapreduce.reduce.env</name>
        <value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
    </property>
</configuration>

Configure yarn-site.xml

# Replace the original <configuration> section with the following

<configuration>
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>master</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
        <value>org.apache.hadoop.mapred.ShuffleHandler</value>
    </property>
</configuration>

Configure the workers file

Delete the existing localhost entry and add the following:

master
slave1
slave2

Configure the Hadoop environment variables

# Create the hadoop.sh file
vi /etc/profile.d/hadoop.sh

# Add the following lines
export HADOOP_HOME=/root/software/hadoop-3.3.6
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin

# Save and exit

source /etc/profile # apply the environment variables

Copy the Hadoop and JDK installation directories plus the environment-variable files to the other nodes

scp -r /root/software/jdk1.8.0_401/ slave1:/root/software
scp -r /root/software/jdk1.8.0_401/ slave2:/root/software
scp -r /root/software/hadoop-3.3.6/ slave1:/root/software
scp -r /root/software/hadoop-3.3.6/ slave2:/root/software
scp -r /etc/profile.d/jdk.sh slave1:/etc/profile.d
scp -r /etc/profile.d/jdk.sh slave2:/etc/profile.d
scp -r /etc/profile.d/hadoop.sh slave1:/etc/profile.d
scp -r /etc/profile.d/hadoop.sh slave2:/etc/profile.d

Apply the environment variables on slave1 and slave2

source /etc/profile

Format the NameNode on the master node

hdfs namenode -format

Start HDFS

start-dfs.sh

Start YARN

start-yarn.sh

Start the JobHistory server

mapred --daemon start historyserver
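
(Optional) A hedged way to confirm the cluster came up, assuming the workers file above (master, slave1, slave2): jps on master should list NameNode, SecondaryNameNode, ResourceManager and JobHistoryServer as well as DataNode and NodeManager, and each slave should list DataNode and NodeManager.

jps                    # list the Java daemons on the current node
hdfs dfsadmin -report  # all three DataNodes should be reported as live
yarn node -list        # all three NodeManagers should be RUNNING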

5. Install MySQL

Upload the MySQL bundle

scp -r "D:\集训相关\环境配置\关系型数据库\mysql-5.7.25-1.el7.x86_64.rpm-bundle.tar" root@192.168.157.200:/root/software

Extract the MySQL bundle (it is a plain tar archive, not gzipped)

tar -xvf /root/software/mysql-5.7.25-1.el7.x86_64.rpm-bundle.tar -C /root/software

Install the RPM packages in the following order

cd /root/software
rpm -ivh mysql-community-common-5.7.25-1.el7.x86_64.rpm
rpm -ivh mysql-community-libs-5.7.25-1.el7.x86_64.rpm
rpm -ivh mysql-community-libs-compat-5.7.25-1.el7.x86_64.rpm
rpm -ivh mysql-community-client-5.7.25-1.el7.x86_64.rpm
rpm -ivh mysql-community-server-5.7.25-1.el7.x86_64.rpm

(Optional) Install the dependency packages first

yum -y install net-tools numactl-libs libaio perl libncurses*

Initialize MySQL

mysqld --initialize --user=mysql

Retrieve the temporary root password

grep password /var/log/mysqld.log

Set ownership of the data directory

chown -R mysql:mysql /var/lib/mysql

Start the MySQL service

nohup /usr/sbin/mysqld --defaults-file=/etc/my.cnf --user=mysql &

Log in to MySQL

mysql -uroot -p
Enter Password: <type the temporary password and press Enter>

Change the MySQL root password to 123456

ALTER USER root@'localhost' IDENTIFIED BY '123456';

Allow the root user to connect from any host

use mysql
UPDATE user SET user.Host='%' WHERE user.user='root';

Flush the privileges

FLUSH PRIVILEGES;

(Optional) Check the root user's grants

show grants for root@'%';
select user, host from mysql.user;

Exit MySQL

exit
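
(Optional) A quick hedged check that remote root access works after the Host change, using the password set above and the master hostname from /etc/hosts:

mysql -h master -uroot -p123456 -e "select user, host from mysql.user;"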

6. Install Hive

Upload the Hive tarball

scp -r "D:\集训相关\环境配置\大数据平台组件\apache-hive-3.1.2-bin.tar.gz" root@192.168.157.200:/root/software

Extract the Hive tarball

tar -zxvf /root/software/apache-hive-3.1.2-bin.tar.gz -C /root/software

Configure the Hive environment variables

vi /etc/profile.d/hive.sh

export HIVE_HOME=/root/software/apache-hive-3.1.2-bin
export PATH=$PATH:$HIVE_HOME/bin

source /etc/profile

Check the Hive version

hive --version

Configure hive-env.sh

export HADOOP_HOME=/root/software/hadoop-3.3.6
export HIVE_CONF_DIR=/root/software/apache-hive-3.1.2-bin/conf
export HIVE_AUX_JARS_PATH=/root/software/apache-hive-3.1.2-bin/lib

Extract the mysql-connector package

tar -zxvf /root/software/mysql-connector-java-5.1.47.tar.gz -C /root/software

Copy the JAR into /root/software/apache-hive-3.1.2-bin/lib

cp /root/software/mysql-connector-java-5.1.47/mysql-connector-java-5.1.47-bin.jar /root/software/apache-hive-3.1.2-bin/lib

Configure hive-site.xml

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

<configuration>
    <property>
        <name>javax.jdo.option.ConnectionURL</name>
        <value>jdbc:mysql://master:3306/hive?createDatabaseIfNotExist=true&amp;useSSL=false&amp;useUnicode=true&amp;characterEncoding=UTF8</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionDriverName</name>
        <value>com.mysql.jdbc.Driver</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionUserName</name>
        <value>root</value>
    </property>
    <property>
        <name>javax.jdo.option.ConnectionPassword</name>
        <value>123456</value>
    </property>
</configuration>

Replace the old Guava JAR with the newer one shipped with Hadoop

rm -rf /root/software/apache-hive-3.1.2-bin/lib/guava-19.0.jar 

cp /root/software/hadoop-3.3.6/share/hadoop/common/lib/guava-27.0-jre.jar /root/software/apache-hive-3.1.2-bin/lib/

Remove the conflicting log4j JAR

rm /root/software/apache-hive-3.1.2-bin/lib/log4j-slf4j-impl-2.10.0.jar

Initialize the Hive metastore

schematool -dbType mysql -initSchema
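
(Optional) A hedged check that the metastore schema was created, assuming the metastore database is named hive as in the ConnectionURL above:

mysql -uroot -p123456 -e "use hive; show tables;" | head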

Start Hive with the CLI

hive

Create a table named student

create table student(id int,name String);

Insert one test row into the student table

INSERT INTO table student values (01,"test");

Verify the data:

SELECT * FROM student;

7. Install Flume

Upload the Flume tarball

scp -r "D:\集训相关\环境配置\大数据平台组件\apache-flume-1.11.0-bin.tar.gz" root@192.168.157.200:/root/software

Extract the Flume tarball

tar -zxvf /root/software/apache-flume-1.11.0-bin.tar.gz -C /root/software

Configure the Flume environment variables

vi /etc/profile.d/flume.sh

export FLUME_HOME=/root/software/apache-flume-1.11.0-bin
export PATH=$PATH:$FLUME_HOME/bin

# Save and exit

source /etc/profile # apply the environment variables

Change to the /root/software/apache-flume-1.11.0-bin/conf directory

cd /root/software/apache-flume-1.11.0-bin/conf

Copy flume-env.sh.template to flume-env.sh

cp flume-env.sh.template flume-env.sh

Open flume-env.sh and set JAVA_HOME

export JAVA_HOME=/root/software/jdk1.8.0_401

Check the Flume version

flume-ng version
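
(Optional) A minimal smoke-test sketch for Flume, assuming a netcat source and a logger sink; the agent name a1, port 44444, and the file name netcat-logger.conf are illustrative:

# Write a minimal agent definition
cat > $FLUME_HOME/conf/netcat-logger.conf <<'EOF'
a1.sources = r1
a1.channels = c1
a1.sinks = k1
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
a1.channels.c1.type = memory
a1.sinks.k1.type = logger
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
EOF

# Start the agent in the foreground; from another shell send a test line, e.g.:
#   echo hello | nc localhost 44444
flume-ng agent --name a1 --conf $FLUME_HOME/conf --conf-file $FLUME_HOME/conf/netcat-logger.conf -Dflume.root.logger=INFO,console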

Task 2: Hive Database Configuration and Maintenance

Create a database named comm in Hive (requirement: do not create it if it already exists).

create database IF NOT EXISTS comm;

Switch to the comm database

use comm;

Create an external table named ods_behavior_log

CREATE EXTERNAL TABLE IF NOT EXISTS ods_behavior_log (line STRING)
PARTITIONED BY (dt STRING)
STORED AS TEXTFILE
LOCATION '/behavior/ods/ods_behavior_log';

Load data into a partition

LOAD DATA LOCAL INPATH '/root/eduhq/data/app_log/behavior/behavior2023-01-01.log' INTO TABLE ods_behavior_log PARTITION (dt='2023-01-01');

List all existing partitions:

SHOW PARTITIONS ods_behavior_log;

View the first 3 rows:

SELECT * FROM ods_behavior_log LIMIT 3;

Count the total number of rows in the table:

SELECT COUNT(*) FROM ods_behavior_log;

Create the dwd_behavior_log table

CREATE EXTERNAL TABLE IF NOT EXISTS dwd_behavior_log (
    client_ip STRING,
    device_type STRING,
    type STRING,
    device STRING,
    url STRING,
    province STRING,
    city STRING,
    ts BIGINT
)
PARTITIONED BY (dt STRING)
STORED AS ORC
LOCATION '/behavior/dwd/dwd_behavior_log'
TBLPROPERTIES ('orc.compress'='SNAPPY');
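(Optional) A quick hedged check of the new table's definition, running the Hive CLI non-interactively from the shell:

hive -e "use comm; show create table dwd_behavior_log;"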

Task 3: Data Acquisition and Processing

Create the directory using HDFS shell commands

hadoop fs -mkdir -p /behavior/origin_log

Upload the files from the local directory to HDFS

hadoop fs -put /root/eduhq/data/app_log/behavior/* /behavior/origin_log/

Open the HDFS Web UI

http://192.168.157.200:9870
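
(Optional) A hedged shell-side check that the upload landed; the exact file names depend on what was in the local behavior directory:

hadoop fs -ls /behavior/origin_log | head
hadoop fs -count /behavior/origin_log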

Task 4: Data Cleaning

Split the timestamp column into date and time using the following spreadsheet formulas

=TEXT(($D2/86400000)+DATE(1970,1,1),"yyyy-mm-dd") # year-month-day format
=TEXT(($D2/86400000)+DATE(1970,1,1),"hh:mm:ss") # hour:minute:second format
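
The same millisecond-epoch conversion can be sanity-checked on a GNU/Linux shell; the sample value 1672531200000 is only an illustration (it corresponds to 2023-01-01 00:00:00 UTC):

ts=1672531200000
date -u -d @$((ts / 1000)) +"%Y-%m-%d"   # year-month-day
date -u -d @$((ts / 1000)) +"%H:%M:%S"   # hour:minute:second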

Task 5: Data Labeling

Randomly obtain "province" and "city" information from the file

The detailed steps are as follows

1. Create the Java UDF classes

  • 1. Open IDEA, create a new project, choose a Maven project, give it any name, and click "Create"

  • 2. Fill in pom.xml with the following content:

    <project xmlns="http://maven.apache.org/POM/4.0.0"

         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>b</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    
    <repositories>
        <repository>
            <id>central</id>
            <url>https://repo.maven.apache.org/maven2</url>
        </repository>
        <repository>
            <id>apache-releases</id>
            <url>https://repository.apache.org/content/repositories/releases/</url>
        </repository>
        <repository>
            <id>apache-snapshots</id>
            <url>https://repository.apache.org/content/repositories/snapshots/</url>
            <releases>
                <enabled>false</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
        </repository>
    </repositories>
    
    <dependencies>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>3.1.2</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-metastore</artifactId>
            <version>3.1.2</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-serde</artifactId>
            <version>3.1.2</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
    
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
    </project>
    

  • 3. Under src\main\java, create a class named IpToLocUdf.java with the following content:

    package Bigdata;

    import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
    import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
    import java.io.FileInputStream;
    import java.io.InputStreamReader;
    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.Scanner;

    public class IpToLocUdf extends GenericUDF {

    public StringObjectInspector ipString;
    public ArrayList<String> dataList = new ArrayList<>();
    
    
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        // Initialize the UDF: check the number and type of the arguments.
        if (arguments.length != 1) {
            throw new UDFArgumentLengthException("arguments only takes 1 arguments: String");
        }
        // The argument must be a string
        ObjectInspector first = arguments[0];
    
        if (!(first instanceof StringObjectInspector)) {
            throw new UDFArgumentException("first argument  must be a string");
        }
        this.ipString = (StringObjectInspector) first;
        // Read the area information from the specified file
        String FilePath = "/root/eduhq/data/area.json";
        try {
            Scanner sc = new Scanner(new InputStreamReader(new FileInputStream(FilePath), "UTF-8"));
            while (sc.hasNext()) {
                String lineData = sc.nextLine();
                this.dataList.add(lineData);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
    }
    
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        // Get the incoming IP-address string argument
        String ip = ipString.getPrimitiveJavaObject(arguments[0].get());
        // Use the smallest octet of the IP as a pseudo-random index
        String[] parts = ip.split("\\.");
        int min = Integer.parseInt(parts[0]);
        for (int i = 1; i < parts.length; i++) {
            int value = Integer.parseInt(parts[i]);
            if (value < min) {
                min = value;
            }
        }
        // Return the city and province from the selected line
        String s = dataList.get(min); //{"city":"绥化","province":"黑龙江"}
        // Brute-force string splitting
        String[] arr = s.replaceAll("\"", "").split(",");
        String city = arr[0].split(":")[1];
        String province = arr[1].split(":")[1].replace("}","");
        return city + "," + province;
    }
    
    @Override
    public String getDisplayString(String[] children) {
        return "_FUNC_(" + children[0] + ")";
    }
    

    }

  • 3. Also under src\main\java, create a class named UrlTransUdf.java with the following content:

    package Bigdata;

    import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
    import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
    import org.apache.hadoop.hive.ql.metadata.HiveException;
    import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
    import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
    import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;

    public class UrlTransUdf extends GenericUDF {

    public StringObjectInspector URL;
    
    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        // Initialize the UDF: check the number and type of the arguments.
        if (arguments.length != 1) {
            throw new UDFArgumentLengthException("arguments only takes 1 arguments: String");
        }
        // The argument must be a string
        ObjectInspector first = arguments[0];
    
        if (!(first instanceof StringObjectInspector)) {
            throw new UDFArgumentException("first argument  must be a string");
        }
        this.URL = (StringObjectInspector) first;
    
        return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
    }
    
    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        // Get the input URL
        String inPut = URL.getPrimitiveJavaObject(arguments[0].get());
    
        return convertHttpsToHttp(inPut);
    }
    
    public static String convertHttpsToHttp(String input) {
        // Pattern that matches the https prefix
        String regex = "https://";
        // Replace https with http
        String replacement = "http://";
        return input.replaceAll(regex, replacement);
    }
    
    @Override
    public String getDisplayString(String[] children) {
        return "_FUNC_(" + children[0] + ")";
    }
    

    }

  • 4. Open a command prompt in the project root (the directory containing pom.xml) and run:

    mvn clean package

  • 5. After a successful build, find the JAR under the target directory and upload it to /root/eduhq/udf_jars on master as hive-udf-behavior-1.0.0.jar (the name used in the steps below)

  • 6. Create the /root/eduhq/udf_jars directory

    mkdir -p /root/eduhq/udf_jars

  • 7. Create the /hive/udf_jars directory on HDFS

    hdfs dfs -mkdir -p /hive/udf_jars

  • 8. Upload the JAR to HDFS

    hdfs dfs -put /root/eduhq/udf_jars/hive-udf-behavior-1.0.0.jar /hive/udf_jars

  • 9. In Hive, add the JAR from HDFS

    add Jar hdfs:///hive/udf_jars/hive-udf-behavior-1.0.0.jar;

  • 10. Create permanent functions

    create function get_city_by_ip AS 'Bigdata.IpToLocUdf';

    create function url_trans AS 'Bigdata.UrlTransUdf';

  • 11. Test the functions

    select get_city_by_ip("192.168.45.25");

    select url_trans("https://baidu.com");

  • 12. Set Hive dynamic partitioning to non-strict mode

    set hive.exec.dynamic.partition.mode=nonstrict;

  • 13. Copy area.json to slave1 and slave2

    scp /root/eduhq/data/area.json root@slave1:/root/eduhq/data/

    scp /root/eduhq/data/area.json root@slave2:/root/eduhq/data/

  • 14. Populate the dwd_behavior_log partitioned table

    insert overwrite table comm.dwd_behavior_log partition(dt)
    SELECT
        jt.client_ip,
        jt.device_type,
        jt.type,
        jt.device,
        url_trans(jt.url) as url,
        split(get_city_by_ip(jt.client_ip), ",")[1] as province,
        split(get_city_by_ip(jt.client_ip), ",")[0] as city,
        jt.ts,
        dt
    FROM comm.ods_behavior_log
    LATERAL VIEW json_tuple(line, 'client_ip', 'device_type', 'type', 'device', 'url', 'time') jt AS client_ip, device_type, type, device, url, ts;

  • 15. List all existing partitions of dwd_behavior_log

    show partitions comm.dwd_behavior_log;

  • View the first 3 rows of the external table dwd_behavior_log, and verify that the URL protocol has been normalized to "http" and that "province" and "city" can be resolved from the IP;

    select * from comm.dwd_behavior_log limit 3;

    -- Verify that the URL protocol has been normalized to "http"
    select count(*) from comm.dwd_behavior_log where url like '%https%';

    -- A result of 0 means the check passed

    -- Verify that "province" and "city" can be resolved from the IP
    select count(*) from comm.dwd_behavior_log where city is null or province is null;

    -- A result of 0 means the check passed

  • 16. Count the total rows in dwd_behavior_log

    select count(*) from comm.dwd_behavior_log;

    -- The result is 140000

Task 6: Data Analysis

Create an external table named dws_behavior_log in the comm database; if the table already exists, drop it first

drop table if exists comm.dws_behavior_log;


create external table comm.dws_behavior_log (
client_ip string,
device_type string,
type string,
device string,
url string,
province string,
city string
)
partitioned by (dt string)
stored as orc
location '/behavior/dws/dws_behavior_log'
tblproperties ('orc.compress'='SNAPPY');

Enable Hive dynamic partitioning and set it to non-strict mode

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;

Use an insert overwrite ... select ... statement to load the data from dwd_behavior_log into the partitioned table dws_behavior_log, dynamically partitioned by dt

insert overwrite table comm.dws_behavior_log partition(dt)
select
 client_ip,device_type,type,device,url,province, city, dt
from comm.dwd_behavior_log;

List all existing partitions of dws_behavior_log, view its first 3 rows, and count the total number of rows

show partitions comm.dws_behavior_log;

-- First 3 rows
select * from comm.dws_behavior_log limit 3;

-- Total row count
select count(*) from comm.dws_behavior_log;

-- The total is 140000

Create an external table named dim_date in the comm database; if the table already exists, drop it first

drop table if exists comm.dim_date;


create external table comm.dim_date (
date_id string,
week_id string,
week_day string,
day string,
month string,
quarter string,
year string,
is_workday string,
holiday_id string
)
row format delimited fields terminated by '\t'
location '/behavior/dim/dim_date'
tblproperties('skip.header.line.count'='1');

Create an external table named dim_area in the comm database; if the table already exists, drop it first

-- Drop the table first if it already exists
drop table if exists comm.dim_area;


-- Create an external table named dim_area
create external table comm.dim_area (
city string,
province string,
are string
)
row format delimited fields terminated by '\t'
location '/behavior/dim/dim_area';

Upload dim_date_2023.txt and dim_area.txt to the local /root/eduhq/data directory in advance

Use load data statements to load dim_date_2023.txt and dim_area.txt from the local /root/eduhq/data directory into the external tables dim_date and dim_area respectively

load data local inpath '/root/eduhq/data/dim_date_2023.txt' into table comm.dim_date;


load data local inpath '/root/eduhq/data/dim_area.txt' into table comm.dim_area;

View the first 3 rows of the external tables dim_date and dim_area

select * from comm.dim_date limit 3;


select * from comm.dim_area limit 3;

Count the total rows of the external tables dim_date and dim_area

select count(*) from comm.dim_date;
-- The result is 365

select count(*) from comm.dim_area;
-- The result is 829

Count user visits per province; export the result to /root/eduhq/result/ads_user_pro on the local file system, using a comma as the column separator

-- Step 1: write the query result into a staging table
create table default.ads_user_pro
row format delimited fields terminated by ','
as
select
province,
count(*) as cnt
from comm.dwd_behavior_log
group by province;


-- Step 2: create the local target directory
mkdir -p /root/eduhq/result/ads_user_pro


-- Step 3: copy the result files from HDFS to the local directory
hdfs dfs -get /user/hive/warehouse/ads_user_pro/* /root/eduhq/result/ads_user_pro/
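
(Optional) A hedged spot-check of the exported result; Hive usually names the files like 000000_0, but the exact names can differ:

ls /root/eduhq/result/ads_user_pro/
head -n 5 /root/eduhq/result/ads_user_pro/*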

Count page views by hour of the day; export the result to /root/eduhq/result/ads_user_hour on the local file system, using a comma as the column separator;

-- Step 1: write the query result into a staging table
create table default.ads_user_hour
row format delimited fields terminated by ','
as
select
hour(from_utc_timestamp(ts, 'GMT+8')) as diff_hour,
count(*) as cnt
from comm.dwd_behavior_log
group by hour(from_utc_timestamp(ts, 'GMT+8'));


-- Step 2: create the local target directory
mkdir -p /root/eduhq/result/ads_user_hour


-- Step 3: copy the result files from HDFS to the local directory
hdfs dfs -get /user/hive/warehouse/ads_user_hour/* /root/eduhq/result/ads_user_hour/

Count the visitor device types for each website; export the result to /root/eduhq/result/ads_visit_mode on the local file system, using a comma as the column separator;

-- Step 1: write the query result into a staging table
create table default.ads_visit_mode
row format delimited fields terminated by ','
as
select
url,
count(distinct device_type) as cnt
from comm.dwd_behavior_log
group by url ;


-- Step 2: create the local target directory
mkdir -p /root/eduhq/result/ads_visit_mode


-- Step 3: copy the result files from HDFS to the local directory
hdfs dfs -get /user/hive/warehouse/ads_visit_mode/* /root/eduhq/result/ads_visit_mode/

Count the access modes for each website; export the result to /root/eduhq/result/ads_online_type on the local file system, using a comma as the column separator;

-- Step 1: write the query result into a staging table
create table default.ads_online_type
row format delimited fields terminated by ','
as
select
url,
count(distinct type) as cnt
from comm.dwd_behavior_log
group by url ;


-- Step 2: create the local target directory
mkdir -p /root/eduhq/result/ads_online_type


-- Step 3: copy the result files from HDFS to the local directory
hdfs dfs -get /user/hive/warehouse/ads_online_type/* /root/eduhq/result/ads_online_type/

Task 7: Business Analysis

Count daily user visits by economic region

create table ads_user_region
row format delimited fields terminated by ','
as
SELECT
    da.are AS are,
    COUNT(l.province) AS num
FROM
    dim_area da
LEFT JOIN
    comm.dwd_behavior_log l
ON
    da.province = l.province
GROUP BY
    da.are;


mkdir -p /root/eduhq/result/ads_user_region


hdfs dfs -get /user/hive/warehouse/comm.db/ads_user_region/* /root/eduhq/result/ads_user_region

Compare page views between holidays and workdays

-- English labels are used because Xterminal cannot input Chinese: workday means a working day, holiday means a public holiday

create table ads_hol_work_user
row format delimited fields terminated by ','
as
SELECT
    CASE
        WHEN dd.is_workday = '0' THEN 'holiday'
        ELSE 'workday'
    END AS is_workday,
    COUNT(l.province) AS num
FROM
    dws_behavior_log l
JOIN
    dim_date dd
ON
    l.dt = dd.date_id
GROUP BY
    dd.is_workday;


mkdir -p /root/eduhq/result/ads_hol_work_user


hdfs dfs -get /user/hive/warehouse/comm.db/ads_hol_work_user/* /root/eduhq/result/ads_hol_work_user

Count user visits by domain

create table ads_user_domian
row format delimited fields terminated by ','
as
select
    url,
    count(device)
from dws_behavior_log
group by url;


mkdir -p /root/eduhq/result/ads_user_domian


hdfs dfs -get /user/hive/warehouse/comm.db/ads_user_domian/* /root/eduhq/result/ads_user_domian