Task 1: Platform Setup and Operations
1. Configure the hosts file
vi /etc/hosts
Add the following entries to the hosts file
192.168.157.200 master
192.168.157.201 slave1
192.168.157.202 slave2
(Optional) Copy the finished hosts file to slave1 and slave2
scp /etc/hosts slave1:/etc
scp /etc/hosts slave2:/etc
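(Optional) A quick way to confirm that the host names now resolve:
ping -c 1 slave1
ping -c 1 slave2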
2. Configure passwordless SSH
1. Generate an RSA key pair
ssh-keygen -t rsa
Press Enter four times to accept the defaults
2. Copy the public key to each node
ssh-copy-id master
ssh-copy-id slave1
ssh-copy-id slave2
Each copy prompts once for "yes" and for the node's password
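To confirm that passwordless login works (a quick check; each command should print the remote hostname without asking for a password):
ssh slave1 hostname
ssh slave2 hostname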
3. Install the JDK
Create the software directory
mkdir -p /root/software
Upload the Linux JDK archive (run the scp below from the Windows host)
scp -r "D:\集训相关\环境配置\大数据平台组件\jdk-8u401-linux-x64.tar.gz" root@192.168.157.200:/root/software
Extract the JDK archive
tar -zxvf /root/software/jdk-8u401-linux-x64.tar.gz -C /root/software
Configure the JDK environment variables
# Create the jdk.sh file
vi /etc/profile.d/jdk.sh
# Add the following content
export JAVA_HOME=/root/software/jdk1.8.0_401
export PATH=$PATH:$JAVA_HOME/bin
export CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
# Save and exit
source /etc/profile # reload the environment variables
Check the JDK version
java -version
(Optional) Run javac to confirm that the Java environment variables have taken effect
javac
4. Install Hadoop
Upload the Hadoop archive (run the scp below from the Windows host)
scp -r "D:\集训相关\环境配置\大数据平台组件\hadoop-3.3.6.tar.gz" root@192.168.157.200:/root/software
Extract the Hadoop archive
tar -zxvf /root/software/hadoop-3.3.6.tar.gz -C /root/software
Create the data directories
mkdir -p /root/software/hadoop-3.3.6/hadoopDatas/tempDatas
mkdir -p /root/software/hadoop-3.3.6/hadoopDatas/namenodeDatas
mkdir -p /root/software/hadoop-3.3.6/hadoopDatas/datanodeDatas
mkdir -p /root/software/hadoop-3.3.6/hadoopDatas/dfs/nn/snn/edits
mkdir -p /root/software/hadoop-3.3.6/hadoopDatas/dfs/snn/edits
mkdir -p /root/software/hadoop-3.3.6/hadoopDatas/dfs/nn/name
Configure hadoop-env.sh (in /root/software/hadoop-3.3.6/etc/hadoop) by appending
export JAVA_HOME=/root/software/jdk1.8.0_401
export HDFS_NAMENODE_USER=root
export HDFS_DATANODE_USER=root
export HDFS_SECONDARYNAMENODE_USER=root
export YARN_RESOURCEMANAGER_USER=root
export YARN_NODEMANAGER_USER=root
Configure core-site.xml
# Replace the existing <configuration> section with the following
<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://master:9000</value>
</property>
</configuration>
Configure hdfs-site.xml
# Replace the existing <configuration> section with the following
<configuration>
<property>
<name>dfs.replication</name>
<value>3</value>
</property>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:/root/software/hadoop-3.3.6/hadoopDatas/namenodeDatas</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:/root/software/hadoop-3.3.6/hadoopDatas/datanodeDatas</value>
</property>
</configuration>
Configure mapred-site.xml
# Replace the existing <configuration> section with the following
<configuration>
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>yarn.app.mapreduce.am.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.map.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
<property>
<name>mapreduce.reduce.env</name>
<value>HADOOP_MAPRED_HOME=${HADOOP_HOME}</value>
</property>
</configuration>
Configure yarn-site.xml
# Replace the existing <configuration> section with the following
<configuration>
<property>
<name>yarn.resourcemanager.hostname</name>
<value>master</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
</configuration>
Configure workers
Replace the existing localhost entry with the following
master
slave1
slave2
Configure the Hadoop environment variables
# Create the hadoop.sh file
vi /etc/profile.d/hadoop.sh
# Add the following content
export HADOOP_HOME=/root/software/hadoop-3.3.6
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
# Save and exit
source /etc/profile # reload the environment variables
Copy the Hadoop and JDK directories and their environment files to the other nodes
scp -r /root/software/hadoop-3.3.6/ slave1:/root/software
scp -r /root/software/hadoop-3.3.6/ slave2:/root/software
scp -r /etc/profile.d/jdk.sh slave1:/etc/profile.d
scp -r /etc/profile.d/jdk.sh slave2:/etc/profile.d
scp -r /etc/profile.d/hadoop.sh slave1:/etc/profile.d
scp -r /etc/profile.d/hadoop.sh slave2:/etc/profile.d
Reload the environment on slave1 and slave2
source /etc/profile
Format the cluster on the master node
hdfs namenode -format
Start HDFS
start-dfs.sh
Start YARN
start-yarn.sh
Start the JobHistory server
mapred --daemon start historyserver
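As a quick sanity check (the expected process list assumes the three-node layout above, with all daemons scheduled on master), verify the running daemons and the DataNode count:
jps                                             # on master, expect NameNode, SecondaryNameNode, DataNode, ResourceManager, NodeManager, JobHistoryServer
hdfs dfsadmin -report | grep "Live datanodes"   # should report 3 live datanodes
Run jps on slave1 and slave2 as well; each should show DataNode and NodeManager.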
5. Install MySQL
Upload the MySQL bundle (run the scp below from the Windows host)
scp -r "D:\集训相关\环境配置\关系型数据库\mysql-5.7.25-1.el7.x86_64.rpm-bundle.tar" root@192.168.157.200:/root/software
Extract the MySQL bundle (it is a plain tar archive, not gzipped)
tar -xvf /root/software/mysql-5.7.25-1.el7.x86_64.rpm-bundle.tar -C /root/software
Install the RPM packages in this order (if rpm reports a conflict with mariadb-libs, remove it first with rpm -e --nodeps mariadb-libs)
cd /root/software
rpm -ivh mysql-community-common-5.7.25-1.el7.x86_64.rpm
rpm -ivh mysql-community-libs-5.7.25-1.el7.x86_64.rpm
rpm -ivh mysql-community-libs-compat-5.7.25-1.el7.x86_64.rpm
rpm -ivh mysql-community-client-5.7.25-1.el7.x86_64.rpm
rpm -ivh mysql-community-server-5.7.25-1.el7.x86_64.rpm
(Optional) Install the dependency packages beforehand
yum -y install net-tools numactl-libs libaio perl libncurses*
Initialize MySQL
mysqld --initialize --user=mysql
Retrieve the temporary root password
grep password /var/log/mysqld.log
Fix ownership of the data directory
chown -R mysql:mysql /var/lib/mysql
Start the MySQL service
nohup /usr/sbin/mysqld --defaults-file=/etc/my.cnf --user=mysql &
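Alternatively, since the mysql-community-server RPM installs a systemd unit on CentOS 7, the service can be managed with systemd instead of the nohup command above (a hedged alternative):
systemctl start mysqld
systemctl enable mysqld   # optional: start MySQL automatically on boot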
Log in to MySQL
mysql -uroot -p
Enter Password: <type the temporary password and press Enter>
Change the MySQL root password to 123456
ALTER USER root@'localhost' IDENTIFIED BY '123456';
Allow the root user to connect from any host
use mysql
UPDATE user SET user.Host='%' WHERE user.user='root';
Flush the root user's privileges
FLUSH PRIVILEGES;
(Optional) Check the root user's privileges
show grants for root@'%';
select user, host from mysql.user;
Exit MySQL
exit
6. Install Hive
Upload the Hive archive (run the scp below from the Windows host)
scp -r "D:\集训相关\环境配置\大数据平台组件\apache-hive-3.1.2-bin.tar.gz" root@192.168.157.200:/root/software
Extract the Hive archive
tar -zxvf /root/software/apache-hive-3.1.2-bin.tar.gz -C /root/software
Configure the Hive environment variables
vi /etc/profile.d/hive.sh
export HIVE_HOME=/root/software/apache-hive-3.1.2-bin
export PATH=$PATH:$HIVE_HOME/bin
source /etc/profile
Check the Hive version
hive --version
Configure hive-env.sh (in the Hive conf directory) by appending
export HADOOP_HOME=/root/software/hadoop-3.3.6
export HIVE_CONF_DIR=/root/software/apache-hive-3.1.2-bin/conf
export HIVE_AUX_JARS_PATH=/root/software/apache-hive-3.1.2-bin/lib
Extract the mysql-connector archive (upload mysql-connector-java-5.1.47.tar.gz to /root/software first, the same way as the other packages)
tar -zxvf /root/software/mysql-connector-java-5.1.47.tar.gz -C /root/software
Copy the connector jar into /root/software/apache-hive-3.1.2-bin/lib
cp /root/software/mysql-connector-java-5.1.47/mysql-connector-java-5.1.47-bin.jar /root/software/apache-hive-3.1.2-bin/lib
Create the hive-site.xml file (in the Hive conf directory) with the following content
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:mysql://master:3306/hive?createDatabaseIfNotExist=true&amp;useSSL=false&amp;useUnicode=true&amp;characterEncoding=UTF8</value>
</property>
<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>com.mysql.jdbc.Driver</value>
</property>
<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>root</value><!-- the MySQL account configured in the previous step -->
</property>
<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>123456</value>
</property>
</configuration>
Replace the old guava jar with the newer one shipped with Hadoop
rm -rf /root/software/apache-hive-3.1.2-bin/lib/guava-19.0.jar
cp /root/software/hadoop-3.3.6/share/hadoop/common/lib/guava-27.0-jre.jar /root/software/apache-hive-3.1.2-bin/lib/
Remove the conflicting log4j-slf4j binding jar
rm /root/software/apache-hive-3.1.2-bin/lib/log4j-slf4j-impl-2.10.0.jar
Initialize the Hive metastore
schematool -dbType mysql -initSchema
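To confirm that the metastore schema was created (a quick check; it assumes the root password 123456 set earlier and the hive database named in the JDBC URL):
mysql -uroot -p123456 -e "use hive; show tables;" | head   # should list metastore tables such as DBS and TBLS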
Start Hive with the CLI
hive
Create a table named student
create table student(id int,name String);
Insert one test row into the student table
INSERT INTO table student values (01,"test");
Verify the data:
SELECT * FROM student;
7. Install Flume
Upload the Flume archive (run the scp below from the Windows host)
scp -r "D:\集训相关\环境配置\大数据平台组件\apache-flume-1.11.0-bin.tar.gz" root@192.168.157.200:/root/software
Extract the Flume archive
tar -zxvf /root/software/apache-flume-1.11.0-bin.tar.gz -C /root/software
Configure the Flume environment variables
vi /etc/profile.d/flume.sh
export FLUME_HOME=/root/software/apache-flume-1.11.0-bin
export PATH=$PATH:$FLUME_HOME/bin
# Save and exit
source /etc/profile # reload the environment variables
Go to the /root/software/apache-flume-1.11.0-bin/conf directory
cd /root/software/apache-flume-1.11.0-bin/conf
Copy flume-env.sh.template to flume-env.sh
cp flume-env.sh.template flume-env.sh
Edit flume-env.sh and set
JAVA_HOME=/root/software/jdk1.8.0_401
Check the Flume version
flume-ng version
Task 2: Hive Database Configuration and Maintenance
Create a database named comm in Hive (requirement: do not create it if it already exists)
create database IF NOT EXISTS comm;
Switch to the comm database
use comm;
Create an external table named ods_behavior_log
CREATE EXTERNAL TABLE IF NOT EXISTS ods_behavior_log (line STRING)
PARTITIONED BY (dt STRING)
STORED AS TEXTFILE
LOCATION '/behavior/ods/ods_behavior_log';
Load data into a partition
LOAD DATA LOCAL INPATH '/root/eduhq/data/app_log/behavior/behavior2023-01-01.log' INTO TABLE ods_behavior_log PARTITION (dt='2023-01-01');
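The remaining daily files follow the same behaviorYYYY-MM-DD.log naming, so instead of repeating the statement for every day, a small loop can load them (a sketch under that naming assumption; it skips 2023-01-01, which was already loaded above):
for f in /root/eduhq/data/app_log/behavior/behavior*.log; do
  dt=$(basename "$f" .log | sed 's/^behavior//')   # e.g. 2023-01-01
  [ "$dt" = "2023-01-01" ] && continue             # already loaded above
  hive -e "LOAD DATA LOCAL INPATH '$f' INTO TABLE comm.ods_behavior_log PARTITION (dt='$dt');"
done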
List all existing partitions:
SHOW PARTITIONS ods_behavior_log;
View the first 3 rows:
SELECT * FROM ods_behavior_log LIMIT 3;
Count the total number of rows in the table:
SELECT COUNT(*) FROM ods_behavior_log;
Create the dwd_behavior_log table
CREATE EXTERNAL TABLE IF NOT EXISTS dwd_behavior_log (
client_ip STRING,
device_type STRING,
type STRING,
device STRING,
url STRING,
province STRING,
city STRING,
ts BIGINT
)
PARTITIONED BY (dt STRING)
STORED AS ORC
LOCATION '/behavior/dwd/dwd_behavior_log'
TBLPROPERTIES ('orc.compress'='SNAPPY');
Task 3: Data Acquisition and Processing
Create the directory with an HDFS shell command
hadoop fs -mkdir -p /behavior/origin_log
Upload the files in the local directory to HDFS
hadoop fs -put /root/eduhq/data/app_log/behavior/* /behavior/origin_log/
Open the HDFS Web UI
http://192.168.157.200:9870
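The upload can also be verified from the shell (the Web UI's Utilities > Browse the file system page shows the same listing):
hadoop fs -ls /behavior/origin_log | head
hadoop fs -count /behavior/origin_log   # directory, file and byte counts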
Task 4: Data Cleaning
Split the timestamp column into date and time with the following spreadsheet formulas
=TEXT(($D2/86400000)+DATE(1970,1,1),"yyyy-mm-dd") # date part (yyyy-mm-dd)
=TEXT(($D2/86400000)+DATE(1970,1,1),"hh:mm:ss") # time part (hh:mm:ss)
Task 5: Data Labeling
Obtain "province" and "city" information at random from the area.json file
The steps are as follows
1. Create the Java UDF project
1. Open IDEA, create a new Maven project (any name will do), and click "Create"
2. Fill pom.xml with the following content:
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.example</groupId>
    <artifactId>b</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>
    <repositories>
        <repository>
            <id>central</id>
            <url>https://repo.maven.apache.org/maven2</url>
        </repository>
        <repository>
            <id>apache-releases</id>
            <url>https://repository.apache.org/content/repositories/releases/</url>
        </repository>
        <repository>
            <id>apache-snapshots</id>
            <url>https://repository.apache.org/content/repositories/snapshots/</url>
            <releases>
                <enabled>false</enabled>
            </releases>
            <snapshots>
                <enabled>true</enabled>
            </snapshots>
        </repository>
    </repositories>
    <dependencies>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>3.1.2</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-metastore</artifactId>
            <version>3.1.2</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-serde</artifactId>
            <version>3.1.2</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.8.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
3. Under src\main\java, create a package named Bigdata and, inside it, a class named IpToLocUdf.java with the following content:
package Bigdata;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Scanner;

public class IpToLocUdf extends GenericUDF {
    public StringObjectInspector ipString;
    public ArrayList<String> dataList = new ArrayList<>();

    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        // Initialize the UDF: check the number and type of the arguments.
        if (arguments.length != 1) {
            throw new UDFArgumentLengthException("arguments only takes 1 arguments: String");
        }
        // The argument must be a string.
        ObjectInspector first = arguments[0];
        if (!(first instanceof StringObjectInspector)) {
            throw new UDFArgumentException("first argument must be a string");
        }
        this.ipString = (StringObjectInspector) first;
        // Read the area information from the local file.
        String filePath = "/root/eduhq/data/area.json";
        try {
            Scanner sc = new Scanner(new InputStreamReader(new FileInputStream(filePath), "UTF-8"));
            while (sc.hasNext()) {
                String lineData = sc.nextLine();
                this.dataList.add(lineData);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
    }

    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        // Get the IP address string passed in.
        String ip = ipString.getPrimitiveJavaObject(arguments[0].get());
        // Use the smallest octet of the IP as a pseudo-random index.
        String[] parts = ip.split("\\.");
        int min = Integer.parseInt(parts[0]);
        for (int i = 1; i < parts.length; i++) {
            int value = Integer.parseInt(parts[i]);
            if (value < min) {
                min = value;
            }
        }
        // Return the city and province of the selected line,
        // e.g. {"city":"绥化","province":"黑龙江"}
        String s = dataList.get(min);
        // Crude parsing of the JSON line.
        String[] arr = s.replaceAll("\"", "").split(",");
        String city = arr[0].split(":")[1];
        String province = arr[1].split(":")[1].replace("}", "");
        return city + "," + province;
    }

    @Override
    public String getDisplayString(String[] children) {
        return "_FUNC_(" + children[0] + ")";
    }
}
4. In the same Bigdata package under src\main\java, create a class named UrlTransUdf.java with the following content:
package Bigdata;

import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentLengthException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;

public class UrlTransUdf extends GenericUDF {
    public StringObjectInspector URL;

    @Override
    public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
        // Initialize the UDF: check the number and type of the arguments.
        if (arguments.length != 1) {
            throw new UDFArgumentLengthException("arguments only takes 1 arguments: String");
        }
        // The argument must be a string.
        ObjectInspector first = arguments[0];
        if (!(first instanceof StringObjectInspector)) {
            throw new UDFArgumentException("first argument must be a string");
        }
        this.URL = (StringObjectInspector) first;
        return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
    }

    @Override
    public Object evaluate(DeferredObject[] arguments) throws HiveException {
        // Get the URL passed in.
        String inPut = URL.getPrimitiveJavaObject(arguments[0].get());
        return convertHttpsToHttp(inPut);
    }

    public static String convertHttpsToHttp(String input) {
        // Pattern matching the HTTPS scheme.
        String regex = "https://";
        // Replace HTTPS with HTTP.
        String replacement = "http://";
        return input.replaceAll(regex, replacement);
    }

    @Override
    public String getDisplayString(String[] children) {
        return "_FUNC_(" + children[0] + ")";
    }
}
5. Open a command prompt in the project root directory (the directory containing pom.xml) and run:
mvn clean package
6. After the build succeeds, locate the jar in the target directory (with the pom above it is named b-1.0-SNAPSHOT.jar)
7. Create the local /root/eduhq/udf_jars directory
mkdir -p /root/eduhq/udf_jars
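The later steps expect the jar at /root/eduhq/udf_jars/hive-udf-behavior-1.0.0.jar, so copy the build output there; one way to do it (a sketch: the Windows project path below is only a placeholder, and the build output name follows from the pom above, renamed on upload):
scp "D:\path\to\your\project\target\b-1.0-SNAPSHOT.jar" root@192.168.157.200:/root/eduhq/udf_jars/hive-udf-behavior-1.0.0.jar  # adjust the Windows path to your project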
8. Create the /hive/udf_jars directory on HDFS
hdfs dfs -mkdir -p /hive/udf_jars
9. Upload the jar to HDFS
hdfs dfs -put /root/eduhq/udf_jars/hive-udf-behavior-1.0.0.jar /hive/udf_jars
10. Register the HDFS jar in Hive
ADD JAR hdfs:///hive/udf_jars/hive-udf-behavior-1.0.0.jar;
11. Create the permanent functions
create function get_city_by_ip AS 'Bigdata.IpToLocUdf';
create function url_trans AS 'Bigdata.UrlTransUdf';
12. Test the functions
select get_city_by_ip("192.168.45.25");
select url_trans("https://baidu.com");
13. Set Hive to non-strict dynamic-partition mode
set hive.exec.dynamic.partition.mode=nonstrict;
14. Copy area.json to the slave1 and slave2 nodes (the UDF reads it from the local filesystem of whichever node runs the task)
scp /root/eduhq/data/area.json root@slave1:/root/eduhq/data/
scp /root/eduhq/data/area.json root@slave2:/root/eduhq/data/
15. Populate the partitioned table dwd_behavior_log
insert overwrite table comm.dwd_behavior_log partition(dt)
SELECT jt.client_ip, jt.device_type, jt.type, jt.device, url_trans(jt.url) as url, split(get_city_by_ip(jt.client_ip),",")[1] as province, split(get_city_by_ip(jt.client_ip),",")[0] as city, jt.ts, dt
FROM comm.ods_behavior_log
LATERAL VIEW json_tuple(line, 'client_ip', 'device_type', 'type', 'device', 'url', 'time') jt AS client_ip, device_type, type, device, url, ts;
16. List all existing partitions of dwd_behavior_log
show partitions comm.dwd_behavior_log;
View the first 3 rows of the external table dwd_behavior_log, and verify that the URL scheme has been unified to "http" and that "province" and "city" can be obtained from the IP
select * from comm.dwd_behavior_log limit 3;
-- Verify that the URL scheme has been unified to "http"
select count(*) from comm.dwd_behavior_log where url like "%https%"; -- 0 means the check passed
-- Verify that "province" and "city" can be obtained from the IP
select count(*) from comm.dwd_behavior_log where city is null or province is null; -- 0 means the check passed
17. Count the total number of rows in dwd_behavior_log
select count(*) from comm.dwd_behavior_log;
-- the result is 140000
Task 6: Data Analysis
Create an external table named dws_behavior_log in the comm database; if the table already exists, drop it first
drop table if exists comm.dws_behavior_log;
create external table comm.dws_behavior_log (
client_ip string,
device_type string,
type string,
device string,
url string,
province string,
city string
)
partitioned by (dt string)
stored as orc
location '/behavior/dws/dws_behavior_log'
tblproperties ('orc.compress'='SNAPPY');
Enable Hive's dynamic partitioning and set it to non-strict mode
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
Use an insert overwrite ... select ... statement to insert the data of dwd_behavior_log into the partitioned table dws_behavior_log, dynamically partitioned by dt
insert overwrite table comm.dws_behavior_log partition(dt)
select
client_ip,device_type,type,device,url,province, city, dt
from comm.dwd_behavior_log;
List all existing partitions of dws_behavior_log, view its first 3 rows, and count its total number of rows
show partitions comm.dws_behavior_log;
-- first 3 rows
select * from comm.dws_behavior_log limit 3;
-- total number of rows
select count(*) from comm.dws_behavior_log;
-- the total is 140000
Create an external table named dim_date in the comm database; if the table already exists, drop it first
drop table if exists comm.dim_date;
create external table comm.dim_date (
date_id string,
week_id string,
week_day string,
day string,
month string,
quarter string,
year string,
is_workday string,
holiday_id string
)
row format delimited fields terminated by '\t'
location '/behavior/dim/dim_date'
tblproperties('skip.header.line.count'='1');
Create an external table named dim_area in the comm database; if the table already exists, drop it first
-- drop the table first if it already exists
drop table if exists comm.dim_area;
-- create an external table named dim_area
create external table comm.dim_area (
city string,
province string,
are string
)
row format delimited fields terminated by '\t'
location '/behavior/dim/dim_area';
Upload the dim_date_2023.txt and dim_area.txt files to the local /root/eduhq/data directory in advance
Use load data statements to load "dim_date_2023.txt" and "dim_area.txt" from the local /root/eduhq/data directory into the external tables dim_date and dim_area
load data local inpath '/root/eduhq/data/dim_date_2023.txt' into table comm.dim_date;
load data local inpath '/root/eduhq/data/dim_area.txt' into table comm.dim_area;
View the first 3 rows of the external tables dim_date and dim_area
select * from comm.dim_date limit 3;
select * from comm.dim_area limit 3;
Count the total number of rows in dim_date and dim_area
select count(*) from comm.dim_date;
-- the result is 365
select count(*) from comm.dim_area;
-- the result is 829
Count user visits per province; export the result to /root/eduhq/result/ads_user_pro on the local filesystem, with a comma as the column separator
-- Step 1: write the query result into a staging table
create table default.ads_user_pro
row format delimited fields terminated by ','
as
select
province,
count(*) as cnt
from comm.dwd_behavior_log
group by province;
-- Step 2: create the local target directory
mkdir -p /root/eduhq/result/ads_user_pro
-- Step 3: copy the result files from HDFS to the local directory
hdfs dfs -get /user/hive/warehouse/ads_user_pro/* /root/eduhq/result/ads_user_pro/
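For reference, the same export can also be done in one step with Hive's insert overwrite local directory, which writes the comma-separated result straight into the local directory on master (a sketch; the same pattern would apply to the exports below):
mkdir -p /root/eduhq/result/ads_user_pro
hive -e "insert overwrite local directory '/root/eduhq/result/ads_user_pro'
row format delimited fields terminated by ','
select province, count(*) from comm.dwd_behavior_log group by province;"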
Count page views by hour of the day; export the result to /root/eduhq/result/ads_user_hour on the local filesystem, with a comma as the column separator
-- Step 1: write the query result into a staging table
create table default.ads_user_hour
row format delimited fields terminated by ','
as
select
hour(from_utc_timestamp(ts,"yyyy-MM-dd HH:mm:ss")) as diff_hour,
count(*) as cnt
from comm.dwd_behavior_log
group by hour(from_utc_timestamp(ts,"yyyy-MM-dd HH:mm:ss"));
-- Step 2: create the local target directory
mkdir -p /root/eduhq/result/ads_user_hour
-- Step 3: copy the result files from HDFS to the local directory
hdfs dfs -get /user/hive/warehouse/ads_user_hour/* /root/eduhq/result/ads_user_hour/
Count the visitor device types per website; export the result to /root/eduhq/result/ads_visit_mode on the local filesystem, with a comma as the column separator
-- Step 1: write the query result into a staging table
create table default.ads_visit_mode
row format delimited fields terminated by ','
as
select
url,
count(distinct device_type) as cnt
from comm.dwd_behavior_log
group by url;
-- Step 2: create the local target directory
mkdir -p /root/eduhq/result/ads_visit_mode
-- Step 3: copy the result files from HDFS to the local directory
hdfs dfs -get /user/hive/warehouse/ads_visit_mode/* /root/eduhq/result/ads_visit_mode/
Count the internet access modes per website; export the result to /root/eduhq/result/ads_online_type on the local filesystem, with a comma as the column separator
-- Step 1: write the query result into a staging table
create table default.ads_online_type
row format delimited fields terminated by ','
as
select
url,
count(distinct type) as cnt
from comm.dwd_behavior_log
group by url;
-- Step 2: create the local target directory
mkdir -p /root/eduhq/result/ads_online_type
-- Step 3: copy the result files from HDFS to the local directory
hdfs dfs -get /user/hive/warehouse/ads_online_type/* /root/eduhq/result/ads_online_type/
Task 7: Business Analysis
Count daily user visits per economic region
create table ads_user_region
row format delimited fields terminated by ','
as
SELECT
da.are AS are,
COUNT(l.province) AS num
FROM
dim_area da
LEFT JOIN
comm.dwd_behavior_log l
ON
da.province = l.province
GROUP BY
da.are;
mkdir -p /root/eduhq/result/ads_user_region
hdfs dfs -get /user/hive/warehouse/comm.db/ads_user_region/* /root/eduhq/result/ads_user_region
Count the difference in page views between holidays and workdays
-- English labels are used because Xterminal cannot input Chinese: workday means a working day, holiday means a public holiday
create table ads_hol_work_user
row format delimited fields terminated by ','
as
SELECT
CASE
WHEN dd.is_workday = '0' THEN 'holiday'
ELSE 'workday'
END AS is_workday,
COUNT(l.province) AS num
FROM
dws_behavior_log l
JOIN
dim_date dd
ON
l.dt = dd.date_id
GROUP BY
dd.is_workday;
mkdir -p /root/eduhq/result/ads_hol_work_user
hdfs dfs -get /user/hive/warehouse/comm.db/ads_hol_work_user/* /root/eduhq/result/ads_hol_work_user
Count user visits per domain
create table ads_user_domian
row format delimited fields terminated by ','
as
select
url,
count(device)
from dws_behavior_log
group by url;
mkdir -p /root/eduhq/result/ads_user_domian
hdfs dfs -get /user/hive/warehouse/comm.db/ads_user_domian/* /root/eduhq/result/ads_user_domian