create or replace function get_bitcount_and(
and_tagids int[], -- 输入标签ID数组
v_bit int, -- 求1或0的比特个数
conn text, -- 连接串
OUT cnt int8 -- 返回值, 多少个1或0
) returns setof int8 as $$
declare
begin
for i in 0..399 loop -- 生成400个链接,因为每行5000万,20亿个BIT,刚好400条。并LOOP
perform conn('link'||i, conn); -- 连接
perform dblink_get_result('link'||i); -- 消耗掉上一次异步连接的结果,否则会报错。
-- 发送异步DBLINK调用
-- 每次操作一个bit分段,返回BIT为0或1的位数
perform dblink_send_query('link'||i, format('select bit_count(bit_and(v), %s) from t_bitmap where tagid = any (%L) and ofid=%s', v_bit, and_tagids, i));
end loop;
for i in 0..399 loop
-- 返回异步调用结果,包括所有分段
return query SELECT * FROM dblink_get_result('link'||i) as t(cnt int8);
end loop;
$$ language plpgsql strict;
7、OR标签组合的并行计算函数(dblink 异步并行),返回USERID透视数。
create or replace function get_bitcount_or(
or_tagids int[],
v_bit int,
conn text, -- 连接串
OUT cnt int8
) returns setof int8 as $$
declare
begin
for i in 0..399 loop
perform conn('link'||i, conn);
perform dblink_get_result('link'||i);
perform dblink_send_query('link'||i, format('select bit_count(bit_or(v), %s) from t_bitmap where tagid = any (%L) and ofid=%s', v_bit, or_tagids, i));
end loop;
for i in 0..399 loop
return query SELECT * FROM dblink_get_result('link'||i) as t(cnt int8);
end loop;
$$ language plpgsql strict;
8、AND,OR 标签组合的并行计算函数(dblink 异步并行),返回USERID透视数。
create or replace function get_bitcount_and_or(
and_tagids int[],
or_tagids int[],
v_bit int,
conn text, -- 连接串
OUT cnt int8
) returns setof int8 as $$
declare
begin
for i in 0..399 loop
perform conn('link'||i, conn);
perform dblink_get_result('link'||i);
perform dblink_send_query('link'||i, format('
with t1 as (select bit_and(v) b from t_bitmap where tagid = any (%L) and ofid=%s),
t2 as (select bit_or(v) b from t_bitmap where tagid = any (%L) and ofid=%s)
select bit_count(bitor(t1.b, t2.b), %s) from t1,t2',
and_tagids, i, or_tagids, i, v_bit));
end loop;
for i in 0..399 loop
return query SELECT * FROM dblink_get_result('link'||i) as t(cnt int8);
end loop;
$$ language plpgsql strict;
-- 更复杂的QUERY,可以自行修改函数。实际业务中这种需求较少。
-- (a and b andc or d) or (a and c) or (d and not b)..........
9、计数透视的性能如下,50个标签组合,仅1.5秒,100个标签组合,仅2.6秒:
我们统计2000亿个user_tags组合(每个用户一条记录,每条记录1000个标签时的换算),仅仅需要2.6秒。
一个标签:
postgres=# select sum(cnt) from (select get_bitcount_and(array_agg(id),1,'dbname=postgres user=postgres') cnt from generate_series(1,1) t(id)) t;
------------
2000000000
(1 row)
Time: 791.392 ms
10个标签组合:
postgres=# select sum(cnt) from (select get_bitcount_and(array_agg(id),1,'dbname=postgres user=postgres') cnt from generate_series(1,10) t(id)) t;
------------
2000000000
(1 row)
Time: 847.427 ms
50个标签组合:
postgres=# select sum(cnt) from (select get_bitcount_and(array_agg(id),1,'dbname=postgres user=postgres') cnt from generate_series(1,50) t(id)) t;
------------
2000000000
(1 row)
Time: 1478.847 ms (00:01.479)
100个标签组合:
postgres=# select sum(cnt) from (select get_bitcount_and(array_agg(id),1,'dbname=postgres user=postgres') cnt from generate_series(1,100) t(id)) t;
------------
2000000000
(1 row)
Time: 2574.761 ms (00:02.575)
10、AND 、 OR组合性能如下,性能一样:
postgres=# select sum(cnt) from (select get_bitcount_and_or(array_agg(case mod(id,2) when 0 then id end), array_agg(case mod(id,2) when 1 then id end), 1,'dbname=postgres user=postgres') cnt from generate_series(1,1) t(id)) t;
-----
(1 row)
Time: 854.934 ms
postgres=# select sum(cnt) from (select get_bitcount_and_or(array_agg(case mod(id,2) when 0 then id end), array_agg(case mod(id,2) when 1 then id end), 1,'dbname=postgres user=postgres') cnt from generate_series(1,10) t(id)) t;
------------
2000000000
(1 row)
Time: 889.472 ms
postgres=# select sum(cnt) from (select get_bitcount_and_or(array_agg(case mod(id,2) when 0 then id end), array_agg(case mod(id,2) when 1 then id end), 1,'dbname=postgres user=postgres') cnt from generate_series(1,50) t(id)) t;
------------
2000000000
(1 row)
Time: 1519.031 ms (00:01.519)
postgres=# select sum(cnt) from (select get_bitcount_and_or(array_agg(case mod(id,2) when 0 then id end), array_agg(case mod(id,2) when 1 then id end), 1,'dbname=postgres user=postgres') cnt from generate_series(1,100) t(id)) t;
------------
2000000000
(1 row)
Time: 2597.701 ms (00:02.598)
11、求USERID,AND 函数如下,我们为了达到高速响应,使用游标返回。
create or replace function get_pos_and(
and_tagids int[], -- 标签组合
v_bit int -- 求1或0的BIT位,返回游标,游标包含ofid与位置下标(当然了,这个翻译动作也可以交给程序,那么返回BIT和ofid即可)
) returns setof refcursor as $$
declare
ref refcursor[]; -- 返回游标数组
res refcursor; -- 返回游标
sql text; -- 游标对应的SQL,即取USERID位置的SQL
begin
for x in 1..400 loop -- 生成400个游标名
ref[x] := 'cur'||x;
end loop;
for i in 0..399 loop
-- 使用0到399的偏移值, 乘以5000万系数。
-- 赋予游标名
res := ref[i+1];
-- 生成游标对应的动态SQL(ofid, bit位置),注意bit位置可以不翻译,交给程序翻译也没问题。程序翻译的话,翻译好之后,再使用in查询字典
-- select uid from uid_mapping where pos in (pos_array);
-- 1亿,in 100万, 380毫秒
-- [《HTAP数据库 PostgreSQL 场景与性能测试之 25 - (OLTP) IN , EXISTS 查询》](201711/20171107_26.md)
sql := format('select %s, bit_posite(bit_and(v), %s, true) from t_bitmap where tagid = any (%L) and ofid=%s', i, v_bit, and_tagids, i);
-- 打开游标
open res for execute sql ;
-- 返回游标
return next res;
end loop;
$$ language plpgsql strict;
12、求USERID,OR 函数如下,我们为了达到高速响应,使用游标返回。
create or replace function get_pos_or(
or_tagids int[],
v_bit int
) returns setof refcursor as $$
declare
ref refcursor[];
res refcursor;
sql text;
begin
for x in 1..400 loop
ref[x] := 'cur'||x;
end loop;
for i in 0..399 loop
res := ref[i+1];
sql := format('select %s, bit_posite(bit_or(v), %s, true) from t_bitmap where tagid = any (%L) and ofid=%s', i, v_bit, or_tagids, i);
open res for execute sql ;
return next res;
end loop;
$$ language plpgsql strict;
13、求USERID,AND OR 函数如下,我们为了达到高速响应,使用游标返回。
create or replace function get_pos_and_or(
and_tagids int[],
or_tagids int[],
v_bit int
) returns setof refcursor as $$
declare
ref refcursor[];
res refcursor;
sql text;
begin
for x in 1..400 loop
ref[x] := 'cur'||x;
end loop;
for i in 0..399 loop
res := ref[i+1];
sql := format('with t1 as
(select bit_and(v) v from t_bitmap where tagid = any (%L) and ofid=%s),
t2 as
(select bit_or(v) v from t_bitmap where tagid = any (%L) and ofid=%s)
select %s, bit_posite(bitor(t1.v, t2.v), %s, true) from t1,t2',
and_tagids, i, or_tagids, i, i, v_bit);
open res for execute sql ;
return next res;
end loop;
$$ language plpgsql strict;
14、求USERID例子,88毫秒响应,极端速度。
postgres=# begin;
BEGIN
Time: 0.031 ms
postgres=# select * from get_pos_and_or(array[1,2,3], array[4,5,6], 1);
get_pos_and_or
----------------
cur399
cur400
(400 rows)
Time: 88.069 ms
获取游标值,5000万ID,仅692毫秒:
fetch 1 from cur1;
Time: 692.408 ms
15、如果我们把位置翻译放到客户端做,那么只需要获取结果BITMAP,那就更快了,224毫秒就可以获取5000万BIT走。 这块也能做成并发,每个客户端获取不同的ofid。
CREATE OR REPLACE FUNCTION public.get_pos_and(and_tagids integer[])
RETURNS SETOF refcursor
LANGUAGE plpgsql
STRICT
AS $function$
declare
ref refcursor[];
res refcursor;
sql text;
begin
for x in 1..400 loop
ref[x] := 'cur'||x;
end loop;
for i in 0..399 loop
res := ref[i+1];
-- sql := format('select %s, bit_posite(bit_and(v), %s, true) from t_bitmap where tagid = any (%L) and ofid=%s', i, v_bit, and_tagids, i);
sql := format('select %s, bit_and(v) from t_bitmap where tagid = any (%L) and ofid=%s', i, and_tagids, i);
open res for execute sql ;
return next res;
end loop;
$function$;
postgres=# \timing
Timing is on.
postgres=# begin;
BEGIN
Time: 0.045 ms
postgres=# select get_pos_and(array_agg(id)) from generate_series(1,100) t(id);
get_pos_and
-------------
cur397
cur398
cur399
cur400
(400 rows)
fetch 1 from cur1;
Time: 224.776 ms
16、如果要求包含某标签,但是不包含某标签的用户,同样使用BIT操作即可。