Loading... us-covid19-counties.dat文件: https://www.lanzouw.com/iI4Bq1zv8d9i ```sql -- 1、准备一下select语法测试环境, -- 新建数据库 CREATE DATABASE lenovo_class07_lys; -- 使用数据库 use lenovo_class07_lys; -- 2、创建表t_usa_covid19 CREATE TABLE t_usa_covid19 ( count_date STRING, county STRING, state STRING, fips INT, cases INT, deaths INT ) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','; -- 将文件上传到HDFS 下面的这个目录 -- /user/hive/warehouse/lenovo_class07_lys.db/t_usa_covid19 -- 3、将源数据load加载到t_usa_covid19表对应的路径下 LOAD DATA INPATH '/user/hive/warehouse/lenovo_class07_lys.db/t_usa_covid19/us-covid19-counties.dat' INTO TABLE t_usa_covid19; -- 4、创建一张分区表 基于count_date日期,state州进行分区, -- 在定义的时候定义除了上述两个字段外的其他四个字段, -- 用count_date日期,state进行分区。 -- 创建分区表t_usa_covid19_p: -- 权限问题:hdfs dfs -chmod -R 1777 / CREATE TABLE IF NOT EXISTS t_usa_covid19_p ( county STRING, fips INT, cases INT, deaths INT ) PARTITIONED BY (count_date STRING, state STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','; -- 5、使用动态分区插入将t_usa_covid19中的数据导入t_usa_covid19_p中 -- 设置Hive动态分区配置 SET hive.exec.dynamic.partition = true; SET hive.exec.dynamic.partition.mode = nonstrict; -- 插入数据到分区表 INSERT INTO TABLE t_usa_covid19_p PARTITION (count_date, state) SELECT county, fips, cases, deaths, count_date, state FROM t_usa_covid19; -- ============== -- 一、查询所有字段或者指定字段 -- (1)查询所有字段 SELECT * FROM t_usa_covid19_p; -- (2)查询确诊病例字段和所在州字段 SELECT cases, state FROM t_usa_covid19_p; -- 二、查询匹配正则表达式的所有字段 -- (1)查询所有以C开头的字段 SELECT county, cases FROM t_usa_covid19_p; -- (2)查询所有含有t字符的字段 SELECT count_date, state FROM t_usa_covid19_p; -- 三、查询使用函数 -- (1)统计cases列数 SELECT COUNT(cases) FROM t_usa_covid19_p; -- (2)统计cases总数 SELECT SUM(cases) FROM t_usa_covid19_p; -- 四、ALL /DISTINCT -- 1、统计cases总数 SELECT SUM(cases) FROM t_usa_covid19_p; -- 2、去掉重复行后cases总数 SELECT SUM(DISTINCT cases) FROM t_usa_covid19_p; -- 3、county,state两个字段整体去重 SELECT DISTINCT county, state FROM t_usa_covid19_p; -- 五、where -- 1、表达式 -- (1)在where后面使用2>3和2<3作为条件进行查询过滤 SELECT * FROM t_usa_covid19_p WHERE 2 < 3; -- (2)在where后面使用2+3作为条件进行查询过滤 SELECT * FROM t_usa_covid19_p WHERE 2 + 3; -- (3)在where后面使用2&3作为条件进行查询过滤 SELECT * FROM t_usa_covid19_p WHERE 2 & 3; -- (4)找出确诊病例和死亡病例的和超过50000的记录 SELECT * FROM t_usa_covid19_p WHERE cases + deaths > 50000; -- 2、函数 -- (1)找出county字母长度大于7的有哪些 SELECT * FROM t_usa_covid19_p WHERE LENGTH(county) > 7; -- 六、group by -- 1、使用group by统计死亡病例数大于10000的州 SELECT state, SUM(deaths) AS total_deaths FROM t_usa_covid19_p GROUP BY state HAVING total_deaths > 10000; -- 七、having -- 1、使用having统计死亡病例数大于10000的州 SELECT state, SUM(deaths) AS total_deaths FROM t_usa_covid19_p GROUP BY state HAVING SUM(deaths) > 10000; -- 八、limit -- (1)没有限制返回2021.1.28 Florida(州)的所有记录 SELECT * FROM t_usa_covid19_p WHERE count_date = '2021-01-28' AND state = 'Florida'; -- (2)显示2021.1.28 Florida(州)的前10条记录 SELECT * FROM t_usa_covid19_p WHERE count_date = '2021-01-28' AND state = 'Florida' LIMIT 10; -- (3)显示2021.1.28 Florida(州)的从第10条开始,共10条记录 SELECT * FROM t_usa_covid19_p WHERE count_date = '2021-01-28' AND state = 'Florida' LIMIT 10 OFFSET 10; ``` 最后修改:2024 年 05 月 27 日 © 允许规范转载 打赏 赞赏作者 赞 咱们谁跟谁,用不着~