us-covid19-counties.dat文件:
https://www.lanzouw.com/iI4Bq1zv8d9i
-- 1、准备一下select语法测试环境,
-- 新建数据库
CREATE DATABASE lenovo_class07_lys;
-- 使用数据库
use lenovo_class07_lys;
-- 2、创建表t_usa_covid19
CREATE TABLE t_usa_covid19 (
count_date STRING,
county STRING,
state STRING,
fips INT,
cases INT,
deaths INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
-- 将文件上传到HDFS 下面的这个目录
-- /user/hive/warehouse/lenovo_class07_lys.db/t_usa_covid19
-- 3、将源数据load加载到t_usa_covid19表对应的路径下
LOAD DATA INPATH '/user/hive/warehouse/lenovo_class07_lys.db/t_usa_covid19/us-covid19-counties.dat' INTO TABLE t_usa_covid19;
-- 4、创建一张分区表 基于count_date日期,state州进行分区,
-- 在定义的时候定义除了上述两个字段外的其他四个字段,
-- 用count_date日期,state进行分区。
-- 创建分区表t_usa_covid19_p:
-- 权限问题:hdfs dfs -chmod -R 1777 /
CREATE TABLE IF NOT EXISTS t_usa_covid19_p (
county STRING,
fips INT,
cases INT,
deaths INT
)
PARTITIONED BY (count_date STRING, state STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ',';
-- 5、使用动态分区插入将t_usa_covid19中的数据导入t_usa_covid19_p中
-- 设置Hive动态分区配置
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;
-- 插入数据到分区表
INSERT INTO TABLE t_usa_covid19_p
PARTITION (count_date, state)
SELECT county, fips, cases, deaths, count_date, state
FROM t_usa_covid19;
-- ==============
-- 一、查询所有字段或者指定字段
-- (1)查询所有字段
SELECT * FROM t_usa_covid19_p;
-- (2)查询确诊病例字段和所在州字段
SELECT cases, state FROM t_usa_covid19_p;
-- 二、查询匹配正则表达式的所有字段
-- (1)查询所有以C开头的字段
SELECT county, cases FROM t_usa_covid19_p;
-- (2)查询所有含有t字符的字段
SELECT count_date, state FROM t_usa_covid19_p;
-- 三、查询使用函数
-- (1)统计cases列数
SELECT COUNT(cases) FROM t_usa_covid19_p;
-- (2)统计cases总数
SELECT SUM(cases) FROM t_usa_covid19_p;
-- 四、ALL /DISTINCT
-- 1、统计cases总数
SELECT SUM(cases) FROM t_usa_covid19_p;
-- 2、去掉重复行后cases总数
SELECT SUM(DISTINCT cases) FROM t_usa_covid19_p;
-- 3、county,state两个字段整体去重
SELECT DISTINCT county, state FROM t_usa_covid19_p;
-- 五、where
-- 1、表达式
-- (1)在where后面使用2>3和2<3作为条件进行查询过滤
SELECT * FROM t_usa_covid19_p WHERE 2 < 3;
-- (2)在where后面使用2+3作为条件进行查询过滤
SELECT * FROM t_usa_covid19_p WHERE 2 + 3;
-- (3)在where后面使用2&3作为条件进行查询过滤
SELECT * FROM t_usa_covid19_p WHERE 2 & 3;
-- (4)找出确诊病例和死亡病例的和超过50000的记录
SELECT * FROM t_usa_covid19_p WHERE cases + deaths > 50000;
-- 2、函数
-- (1)找出county字母长度大于7的有哪些
SELECT * FROM t_usa_covid19_p WHERE LENGTH(county) > 7;
-- 六、group by
-- 1、使用group by统计死亡病例数大于10000的州
SELECT state, SUM(deaths) AS total_deaths
FROM t_usa_covid19_p
GROUP BY state
HAVING total_deaths > 10000;
-- 七、having
-- 1、使用having统计死亡病例数大于10000的州
SELECT state, SUM(deaths) AS total_deaths
FROM t_usa_covid19_p
GROUP BY state
HAVING SUM(deaths) > 10000;
-- 八、limit
-- (1)没有限制返回2021.1.28 Florida(州)的所有记录
SELECT * FROM t_usa_covid19_p
WHERE count_date = '2021-01-28' AND state = 'Florida';
-- (2)显示2021.1.28 Florida(州)的前10条记录
SELECT * FROM t_usa_covid19_p
WHERE count_date = '2021-01-28' AND state = 'Florida'
LIMIT 10;
-- (3)显示2021.1.28 Florida(州)的从第10条开始,共10条记录
SELECT * FROM t_usa_covid19_p
WHERE count_date = '2021-01-28' AND state = 'Florida'
LIMIT 10 OFFSET 10;