func(F|func|expr) over(
[partition by F1[,F2,...] 确定分区的边界(范围)
[order by Fa [asc|desc][,Fb [asc|desc]]] 确定分区内行的排列顺序(只有指定了ORDER BY才可以使用Window_Clause)
rows|range between ... and ... rows:物理行号(无重复) range:排序号(可能重复,会将重复行的数据视为一个整体)
unbounded preceding
N preceding
current row
N following
unbounded following
)[as] ALIA(别名)
SELECT group_id,COUNT(*) AS cnt
FROM example_table
GROUP BY group_id
ORDER BY cnt DESC;
计算分割因子
SELECT FLOOR(COUNT(*)/t1.avg_cnt) AS split_factor
FROM example_table
CROSS JOIN(
SELECT AVG(cnt) AS avg_cnt
FROM(
SELECT COUNT(*) AS cnt
FROM example_table
WHERE group_id <> 2
GROUP BY group_id
WHERE group_id = 2;
形成新分组键
SELECT CONCAT_WS('_', group_id, bucket_id) as new_group_key, data
FROM (
SELECT group_id, data,
WHEN group_id = 2 THEN NTILE(split_factor) OVER (PARTITION BY group_id ORDER BY data)
ELSE 0
END as bucket_id
FROM example_table,
(SELECT FLOOR(COUNT(*) / AVG_COUNT) as split_factor
FROM example_table
CROSS JOIN (
SELECT AVG(cnt) as AVG_COUNT
FROM (
SELECT COUNT(*) as cnt
FROM example_table
WHERE group_id != 2
GROUP BY group_id
WHERE group_id = 2) t3
) result;
分析:不支持ROWS|RANGE BETWEEN,需要考虑F是否为NULL(IF(F IS NULL,NULL,...) AS ...)
-- 【词频统计】ngrams()和context_ngrams()都要与sentences()函数一起使用 SELECT ngrams(sentences('hello how are you? fine , thank you and you?'),2,3); -- 第一个参数:单词二维数组 -- 第二个参数:连续N个单词 -- 第三个参数:top-k SELECT context_ngrams(sentences('hello how are you? fine , thank you and you?'),array('how',null),3); -- 第一个参数:单词二维数组 -- 第二个参数:和how右搭配的单词词频统计 -- 第三个参数:top-k 统计分词结果中与数组指定单词一起出现的频率最高的TOP-K结果。 ** 其他搭配方式: 左侧搭配:array(null,'how'); 特定位置的搭配:array('how',null,null); 寻找和'how'隔了一个单词搭配的单词 精确序列:array('how', 'are', 'you')
{ "store": {
"book": [
{ "category": "reference",
"author": "Nigel Rees",
"title": "Sayings of the Century",
"price": 8.95
{ "category": "fiction",
"author": "Evelyn Waugh",
"title": "Sword of Honour",
"price": 12.99
提取第一本书的作者:
SELECT get_json_object(json_string,'$.store.book[0].author')
FROM TABLE_NAME;
SELECT json_tuple(json_string,'FIELD1','FIELD2') AS (col1,col2) 提取json字符串中的特定字段并作为独立的列返回
with tmp as (
SELECT json_tuple('{"name":"张三","hobbies":["beauty","money","power"],"address":{"province":"江苏","city":"南京"}}',
'name', 'hobbies', 'address') as (name, hobbies, address)
如果提取的字段不可以直接作为独立的列,则可先作为临时表。
SELECT name,
get_json_object(address,'`$`.province') as province,
get_json_object(address,'`$`.city') as city,
hobby
from tmp
lateral view explode(split(regexp_replace(hobbies,'\\[|]|"',''),','))V as hobby;
name province city hobbies 张三 江苏 南京 beauty 张三 江苏 南京 money 张三 江苏 南京 power
with tmp as (
SELECT json_tuple('{"name":"张三","hobbies":["beauty","money","power"],"address":{"province":"江苏","city":"南京"}}',
'name', 'hobbies', 'address') as (name, hobbies, address)
), tmp2 as (
SELECT name,
-- get_json_object(address,'`$`.province') as province,
-- get_json_object(address,'`$`.city') as city,
regexp_extract(address,'\\{"province":"(.*?)","city":"(.*?)"}',1) as province,
regexp_extract(address,'\\{"province":"(.*?)","city":"(.*?)"}',2) as city
regexp_replace(hobbies,'\\[|]|"','') as hobbies
FROM tmp
name province city hobbies 张三 江苏 南京 beauty,money,power
如果没有将hobbies由一行转多列,如何实现类似"查询所有爱好为beauty的用户"的查询?
SELECT * FROM tmp2
-- WHERE find_in_set('beauty',hobbies)>0;
-- WHERE locate('beauty',hobbies)>0;
-- WHERE hobbies RLIKE '.*beauty.*';
WITH tmp AS (
SELECT parse_url_tuple('https://search.jd.com/Search?keyword=%E5%8D%8E%E4%B8%BA%E6%89%8B%E6%9C%BAmate60&enc=utf-8&suggest=1.def.0.SAK7|MIXTAG_SAK7R,SAK7_M_AM_L5366,SAK7_M_GUD_R,SAK7_S_AM_R,SAK7_D_HSP_L30657,SAK7_SC_PD_R,SAK7_SM_PB_R,SAK7_SM_PRK_R,SAK7_SM_PRC_R,SAK7_SM_PRR_LC,SAK7_SS_PM_R|&wq=%E5%8D%8E%E4%B8%BA&pvid=65c357d9dfb44555a9eb8708ca539b8b','QUERY','keyword') AS keyword,
parse_url_tuple('https://search.jd.com/Search?keyword=%E5%8D%8E%E4%B8%BA%E6%89%8B%E6%9C%BAmate60&enc=utf-8&suggest=1.def.0.SAK7|MIXTAG_SAK7R,SAK7_M_AM_L5366,SAK7_M_GUD_R,SAK7_S_AM_R,SAK7_D_HSP_L30657,SAK7_SC_PD_R,SAK7_SM_PB_R,SAK7_SM_PRK_R,SAK7_SM_PRC_R,SAK7_SM_PRR_LC,SAK7_SS_PM_R|&wq=%E5%8D%8E%E4%B8%BA&pvid=65c357d9dfb44555a9eb8708ca539b8b','QUERY','keyword') AS enc
SELECT reflect('java.net.URLDecoder','decode',keyword, if(enc is null,'UTF-8',enc)) AS keyword FROM tmp;
SELECT parse_url_tuple('https://search.jd.com/Search?keyword=%E5%8D%8E%E4%B8%BA%E6%89%8B%E6%9C%BAmate60&enc=utf-8&suggest=1.def.0.SAK7|MIXTAG_SAK7R,SAK7_M_AM_L5366,SAK7_M_GUD_R,SAK7_S_AM_R,SAK7_D_HSP_L30657,SAK7_SC_PD_R,SAK7_SM_PB_R,SAK7_SM_PRK_R,SAK7_SM_PRC_R,SAK7_SM_PRR_LC,SAK7_SS_PM_R|&wq=%E5%8D%8E%E4%B8%BA&pvid=65c357d9dfb44555a9eb8708ca539b8b','PROTOCOL','QUERY')
SELECT if(true,1,0);
SELECT in_file('ariel','/root/hive/data/course/hive_func_in_file.data'); -- 判断 某个表中某个字段的值|指定内容 是否出现在指定文件中
SELECT isfalse();
SELECT isfalse(0); => true
SELECT istrue();
SELECT isnull();
SELECT not(); -- 取相反的情况
SELECT nullif(2,3); -- 如果两个参数相等,返回NULL;如果不相等,返回第一个参数。
避免除零错误:
SELECT col1, col2, col1/NULLIF(col2, 0) AS result(任何涉及NULL的结果都是NULL)
FROM TABLE_NAME;
SELECT nvl(null,3); -- 返回第一个非NULL参数的值(只能有两个参数)
替换NULL值为默认值:
SELECT NVL(col,'默认值') AS new_col
FROM TABLE_NAME;
聚合函数中处理NULL值:
SELECT SUM(NVL(col,0)) AS total
FROM TABLE_NAME;
SELECT coalesce(null,null,...,5); -- 返回第一个非NULL参数的值(可以有多个参数)
SELECT case f1 when V1 then ... when v2 then ... else vn end;
SELECT case when f1>=v1 then ... when f1>=v2 then ... else ... end;
WITH tmp AS(
SELECT named_struct('name','me','age',18,'is_member',true) as self,
array(
named_struct('name','henry','age',22,'is_member',true),
named_struct('name','pola','age',20,'is_member',true),
named_struct('name','ariel','age',16,'is_member',false)
) AS array_struct
SELECT self.name,self.age,self.is_member,name,age,is_member
FROM tmp
LATERAL VIEW inline(array_struct)V AS name,age,is_member;
self.name self.age self.is_member name age is_member me 18 true henry 22 true me 18 true pola 20 true me 18 true ariel 16 false