Thursday, February 03, 2022

How To: Simple String Similarity Function in Snowflake

I created this function to help with performing a simple title matching logic for tv show meta data sourced from two different data providers.    I found it satisfactory for my needs at the time.  Obviously it may need a human in the loop to deal with titles that are so different that this process doesn't work.


CREATE OR REPLACE FUNCTION STRING_SIMILARITY (STR1 string, STR2 string, SUB_STRING_LENGTH variant, CASE_SENSITIVE boolean) 

  RETURNS VARCHAR
  LANGUAGE JAVASCRIPT
AS
$$
if (CASE_SENSITIVE == false) {
STR1 = STR1.toLowerCase();
STR2 = STR2.toLowerCase();
}
if (STR1.length < SUB_STRING_LENGTH || STR2.length < SUB_STRING_LENGTH)
return 0;
const map = new Map();
for (let i = 0; i < STR1.length - (SUB_STRING_LENGTH - 1); i++) {
const substr1 = STR1.substr(i, SUB_STRING_LENGTH);
map.set(substr1, map.has(substr1) ? map.get(substr1) + 1 : 1);
}
let match = 0;
for (let j = 0; j < STR2.length - (SUB_STRING_LENGTH - 1); j++) {
const substr2 = STR2.substr(j, SUB_STRING_LENGTH);
const count = map.has(substr2) ? map.get(substr2) : 0;
if (count > 0) {
map.set(substr2, count - 1);
match++;
}
}
return (match * 2) / (STR1.length + STR2.length - ((SUB_STRING_LENGTH - 1) * 2));
$$
;
SELECT STRING_SIMILARITY ('Startrek', 'Star Trek', 2, FALSE) ;


No comments: