Saturday, April 10, 2021

How To: Linear Regression for Big Data in Snowflake

BI tools like Looker will do the linear regression on small datasets and are performed in memory.   As awesome as these tools like this are, they are limited to small datasets.    And when you need to feed that data into a gradient boosting tool like XG Boost, you will most likely want to perform a linear regression in SQL.   

Below are two examples.  One that can be ran in any DB that supports CTEs and the other can be ran in Snowflake.  I have also projected out the trend line beyond the sample data to show you how to do simple projections.  Also I throw in R2 so you can evaluate the model fit. 

Sample Data

CREATE OR REPLACE TEMPORARY TABLE DATA_SAMPLE (X INTEGER,Y INTEGER);

INSERT INTO DATA_SAMPLE (X,Y)
VALUES (1,41),
       (2,963),
       (3,4506),
       (4,13557),
       (5,40422),
       (6,61484),
       (7,85075),
       (8,118436),
       (9,134457),
       (10,180253);

Generic SQL Logic

WITH X_Y_AVG AS (
    SELECT   X
           , Y
           , AVG(X) OVER() AS X_AVG
           , AVG(Y) OVER() AS Y_AVG
      FROM DATA_SAMPLE
)
, SLOPE AS (
    SELECT   SUM((X - X_AVG) * (Y - Y_AVG)) / SUM((X - X_AVG) * (X - X_AVG)) AS SLOPE
           , MAX(X_AVG) AS X_AVG_MAX
           , MAX(Y_AVG) AS Y_AVG_MAX
      FROM X_Y_AVG
)
, CALC AS (
    SELECT   SLOPE
           , Y_AVG_MAX - X_AVG_MAX * SLOPE AS INTERCEPT
      FROM SLOPE
)
, TREND_LINE AS (
        SELECT   X_Y_AVG.X
               , X_Y_AVG.Y
               , X_Y_AVG.X_AVG
               , X_Y_AVG.Y_AVG
               , CALC.SLOPE
               , CALC.INTERCEPT
               , (X_Y_AVG.X * CALC.SLOPE) + CALC.INTERCEPT AS TREND_LINE_Y
          FROM X_Y_AVG
          CROSS JOIN CALC
         UNION ALL
        SELECT   PROJECTION_X.X
               , NULL
               , PROJECTION_X.X
               , NULL
               , CALC.SLOPE
               , CALC.INTERCEPT
               , (PROJECTION_X.X * CALC.SLOPE) + CALC.INTERCEPT AS TREND_LINE_Y
          FROM (SELECT COLUMN1 AS X
                  FROM VALUES  (11),
                               (12),
                               (13),
                               (14),
                               (15),
                               (16)) AS PROJECTION_X 
          CROSS JOIN CALC
         ORDER BY X
  )
, RSQUARED AS (
    SELECT 1-(SUM(POWER(Y - TREND_LINE_Y,2)))/(SUM(POWER(Y - Y_AVG,2))) AS R2
      FROM TREND_LINE
)
SELECT   TREND_LINE.X
       , TREND_LINE.Y
       , TREND_LINE.SLOPE
       , TREND_LINE.INTERCEPT
       , TREND_LINE.TREND_LINE_Y
       , IFF(TREND_LINE.Y IS NOT NULL, (SELECT R2 FROM RSQUARED), NULL) AS R2
  FROM TREND_LINE
;    

Snowflake Specific SQL Logic Using REGR_SLOPE & REGR_INTERCEPT Functions

WITH CALC AS (
    SELECT   REGR_SLOPE(Y, X) AS SLOPE
           , REGR_INTERCEPT(Y, X) AS INTERCEPT
     FROM DATA_SAMPLE
)
SELECT   DATA_SAMPLE.X
       , DATA_SAMPLE.Y
       , CALC.SLOPE
       , CALC.INTERCEPT
       , (DATA_SAMPLE.X * CALC.SLOPE) + CALC.INTERCEPT AS TREND_LINE_Y
       , REGR_R2(DATA_SAMPLE.Y, "TREND_LINE_Y") OVER() AS R2
  FROM DATA_SAMPLE
  CROSS JOIN CALC
 UNION ALL
SELECT   PROJECTION_X.X
       , NULL
       , CALC.SLOPE
       , CALC.INTERCEPT
       , (PROJECTION_X.X * CALC.SLOPE) + CALC.INTERCEPT AS TREND_LINE_Y
       , NULL
  FROM (SELECT COLUMN1 AS X
          FROM VALUES  (11),
                       (12),
                       (13),
                       (14),
                       (15),
                       (16)) AS PROJECTION_X 
  CROSS JOIN CALC
  ORDER BY X;  



No comments: