天天看點

資料庫自定義聚合函數(求和、标準差、平均值、幾何平均值、幾何标準差、偏度系數、峰度系數)

/*
 * stats_agg 是 PostgreSQL 的聚合函數工作方式類似于先前存在的聚集體(min(x),max(),avg()等),但計算一
 * 次,并傳回他們都多的統計資料。傳回的統計資料是count,min,max,mean,variance,skewness,和 kurtosis,但其他人可以加入。
 * 
 * 我需要一個傳回偏度和峰度的聚合,而不是為每個需要多次傳遞來計算平均值等的單獨函數,我認為最好有一個聚合在一次傳遞中傳回所有内容。
 * 
 * 感謝 John D. Cook 和他的部落格文章(https://www.johndcook.com/blog/skewness_kurtosis/)解釋了如何做到這一點。
 * */

CREATE TYPE _stats_agg_accum_type AS (
	n BIGINT,
	min DOUBLE PRECISION,
	max DOUBLE PRECISION,
	m1 DOUBLE PRECISION,
	m2 DOUBLE PRECISION,
	m3 DOUBLE PRECISION,
	m4 DOUBLE PRECISION
);

CREATE TYPE _stats_agg_result_type AS (
	count BIGINT,
	min DOUBLE PRECISION,
	max DOUBLE PRECISION,
	mean DOUBLE PRECISION,
	variance DOUBLE PRECISION,
	skewness DOUBLE PRECISION,
	kurtosis DOUBLE PRECISION
);

CREATE OR REPLACE FUNCTION _stats_agg_accumulator(_stats_agg_accum_type, DOUBLE PRECISION)
RETURNS _stats_agg_accum_type AS 
$example_table$
DECLARE
	a ALIAS FOR $1;
	x ALIAS FOR $2;
	n1 BIGINT;
	delta DOUBLE PRECISION;
	delta_n DOUBLE PRECISION;
	delta_n2 DOUBLE PRECISION;
	term1 DOUBLE PRECISION;
BEGIN
	IF x IS NOT NULL THEN
		n1 = a.n;
		a.n = a.n + 1;
		delta = x - a.m1;
		delta_n = delta / a.n;
		delta_n2 = delta_n * delta_n;
		term1 = delta * delta_n * n1;
		a.m1 = a.m1 + delta_n;
		a.m4 = a.m4 + term1 * delta_n2 * (a.n*a.n - 3*a.n + 3) + 6 * delta_n2 * a.m2 - 4 * delta_n * a.m3;
		a.m3 = a.m3 + term1 * delta_n * (a.n - 2) - 3 * delta_n * a.m2;
		a.m2 = a.m2 + term1;
		a.min = LEAST(a.min, x);
		a.max = GREATEST(a.max, x);
	END IF;
	
	RETURN a;
END;
$example_table$
LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION _stats_agg_finalizer(_stats_agg_accum_type)
RETURNS _stats_agg_result_type AS 
$example_table$
BEGIN
	RETURN ROW(
		$1.n, 
		$1.min,
		$1.max,
		$1.m1,
		$1.m2 / NULLIF(($1.n - 1.0), 0), 
		CASE WHEN $1.m2 = 0 THEN NULL ELSE SQRT($1.n) * $1.m3 / NULLIF(($1.m2 ^ 1.5), 0) END, 
		CASE WHEN $1.m2 = 0 THEN NULL ELSE $1.n * $1.m4 / NULLIF(($1.m2 * $1.m2) - 3.0, 0) END
	);
END;
$example_table$
LANGUAGE plpgsql;

CREATE AGGREGATE stats_agg(DOUBLE PRECISION) (
	sfunc = _stats_agg_accumulator,
	stype = _stats_agg_accum_type,
	finalfunc = _stats_agg_finalizer,
	initcond = '(0,,, 0, 0, 0, 0)'
);


/**
 * 自定義函數
 * 以下是根據具體項目需求和需求計算公式進行聚合(非标準聚合公式)
 */
/* 求和 */
CREATE OR REPLACE FUNCTION sum(arr DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION
LANGUAGE plpgsql AS 
$function$
declare
    ret DOUBLE PRECISION;
    item DOUBLE PRECISION;
    a DOUBLE PRECISION = 0;
begin
    FOREACH item IN ARRAY arr
    LOOP
        a = a + item;
    END LOOP;
    ret = a;
    RETURN ret;
END;
$function$
;

/* 标準差 */
CREATE OR REPLACE FUNCTION stddev(arr DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION
LANGUAGE plpgsql AS 
$function$
DECLARE
    ret DOUBLE PRECISION;
    item DOUBLE PRECISION;
    mean DOUBLE PRECISION;
    a DOUBLE PRECISION = 0;
    c INT = 0;
BEGIN
    c = ARRAY_LENGTH(arr,1);
    mean = MEAN(arr);
    FOREACH item IN ARRAY arr
    LOOP
        a = a + POW(item - mean, 2);
    END LOOP;
    ret = POW(a/(c - 1), 0.5);
    RETURN ret;
END;
$function$
;

/* 平均值 */
CREATE OR REPLACE FUNCTION mean(arr DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION
LANGUAGE plpgsql AS 
$function$
DECLARE
    ret DOUBLE PRECISION;
    item DOUBLE PRECISION;
    a DOUBLE PRECISION = 0;
    c INT = 0;
BEGIN
    c = ARRAY_LENGTH(arr, 1);
    FOREACH item IN ARRAY arr
    LOOP
        a = a + item;
    END LOOP;
    ret = a / c ;
    RETURN ret;
END;
$function$
;

/*幾何平均值*/
CREATE OR REPLACE FUNCTION geo_mean(arr DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION
LANGUAGE plpgsql AS 
$function$
DECLARE
    ret DOUBLE PRECISION;
    item DOUBLE PRECISION;
    a DOUBLE PRECISION = 0;
    c INT;
BEGIN
    FOREACH item IN ARRAY arr
    LOOP
        IF a = 0 THEN
            a = item;
        ELSE
            a = A * ITEM;
        END IF;
    END LOOP;
    c = ARRAY_LENGTH(arr,1);
    ret = POW(a, 1.0 / c);
    RETURN ret;
END;
$function$
;

/* 幾何标準差 */
CREATE OR REPLACE FUNCTION geo_std(arr DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION
LANGUAGE plpgsql AS 
$function$
declare
    ret DOUBLE PRECISION;
    item DOUBLE PRECISION;
    a DOUBLE PRECISION = 0;
    b DOUBLE PRECISION = 0;
    base NUMERIC = 10.0;
    c INT = 0;
BEGIN
    FOREACH item IN ARRAY arr
    LOOP
        a = a + POW(LOG(base, item::NUMERIC), 2.0);
        b = b + LOG(base, item::NUMERIC);
    END LOOP;
    b = POW(b, 2);
    c = ARRAY_LENGTH(arr, 1);
    ret = 10 ^ (POW(((a - b / c) / (c - 1)), 0.5));
    RETURN ret;
END;
$function$
;

/* 偏度系數 */
CREATE OR REPLACE FUNCTION skewness(arr DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION
LANGUAGE plpgsql AS 
$function$
DECLARE
    ret DOUBLE PRECISION;
    item DOUBLE PRECISION;
    a DOUBLE PRECISION = 0;
    mean DOUBLE PRECISION = 0;
    std_val DOUBLE PRECISION = 0;
    c INT = 0;
BEGIN
    c = ARRAY_LENGTH(arr,1);
    mean = MEAN(arr);
    std_val = STDDEV(arr);
    FOREACH item IN ARRAY arr
    LOOP
        a = a + (POW(item - mean, 3) / POW(std_val, 3));
    END LOOP;
    ret = ((1 / (c - 1)::DOUBLE PRECISION) * a);
    RETURN ret;
END;
$function$
;

/* 峰度系數 */
CREATE OR REPLACE FUNCTION kurtosis(arr DOUBLE PRECISION[])
RETURNS DOUBLE PRECISION
LANGUAGE plpgsql AS 
$function$
DECLARE
    ret DOUBLE PRECISION;
    item DOUBLE PRECISION;
    a DOUBLE PRECISION = 0;
    mean DOUBLE PRECISION = 0;
    std_val DOUBLE PRECISION = 0;
    c INT = 0;
BEGIN
    c = ARRAY_LENGTH(arr,1);
    mean = MEAN(arr);
    std_val = STDDEV(arr);
    FOREACH item IN ARRAY arr
    LOOP
        a = a + (POW(item - mean, 4) / POW(std_val, 4));
    END LOOP;
    ret = ((1 / (c-1)::DOUBLE PRECISION) * a) - 3;
    RETURN ret;
END;
$function$
;
           

自定義函數中用到的幾個公式

 1、 平均值

資料庫自定義聚合函數(求和、标準差、平均值、幾何平均值、幾何标準差、偏度系數、峰度系數)

2、标準差

資料庫自定義聚合函數(求和、标準差、平均值、幾何平均值、幾何标準差、偏度系數、峰度系數)

3、幾何平均值

資料庫自定義聚合函數(求和、标準差、平均值、幾何平均值、幾何标準差、偏度系數、峰度系數)

 4、幾何标準差

資料庫自定義聚合函數(求和、标準差、平均值、幾何平均值、幾何标準差、偏度系數、峰度系數)

5、偏度系數

資料庫自定義聚合函數(求和、标準差、平均值、幾何平均值、幾何标準差、偏度系數、峰度系數)

6、峰度系數

資料庫自定義聚合函數(求和、标準差、平均值、幾何平均值、幾何标準差、偏度系數、峰度系數)

Java代碼實作commons-math之常用科學計算