Home » SQL & PL/SQL » SQL & PL/SQL » Jaro-Winkler similarity for 9i
Jaro-Winkler similarity for 9i [message #486569] Thu, 16 December 2010 16:09 Go to next message
Barbara Boehmer
Messages: 9090
Registered: November 2002
Location: California, USA
Senior Member
I received an email request for a pl/sql function for 9i that would do the same thing as utl_match.jaro_winkler_similarity does in 10g and 11g, so I wrote the following code and test. I thought I would share it here for anyone else who might want it or for anybody to suggest improvements for the code.

CREATE OR REPLACE FUNCTION jws -- Jaro-Winkler similarity
  (p_string1     IN VARCHAR2,
   p_string2     IN VARCHAR2)
  RETURN            NUMBER
  DETERMINISTIC
AS
  v_closeness       NUMBER := 0;
  v_temp            VARCHAR2 (32767);
  v_comp1           VARCHAR2 (32767);
  v_comp2           VARCHAR2 (32767);
  v_matches         NUMBER := 0; 
  v_char            VARCHAR2 (1);
  v_transpositions  NUMBER := 0;
  v_d_jaro          NUMBER := 0;
  v_leading         NUMBER := 0;
  v_d_winkler       NUMBER := 0;
  v_jws             NUMBER := 0;
BEGIN
  -- check for null strings:
  IF p_string1 IS NULL OR p_string2 IS NULL THEN 
    RETURN 0;
  END IF;
  -- closeness:
  v_closeness := (GREATEST (LENGTH (p_string1), LENGTH (p_string2)) / 2) - 1;
  -- find matching characters and transpositions within closeness:
  v_temp := p_string2;
  FOR i IN 1 .. LENGTH (p_string1) LOOP
    IF INSTR (v_temp, SUBSTR (p_string1, i, 1)) > 0 THEN
      v_char := SUBSTR (p_string1, i, 1);
      IF ABS (INSTR (p_string1, v_char) - INSTR (p_string2, v_char)) <= v_closeness THEN
        v_comp1 := v_comp1 || SUBSTR (p_string1, i, 1);
        v_temp := SUBSTR (v_temp, 1, INSTR (v_temp, SUBSTR (p_string1, i, 1)) - 1)
               || SUBSTR (v_temp, INSTR (v_temp, SUBSTR (p_string1, i, 1)) + 1);
      END IF;
    END IF;    
  END LOOP;
  v_temp := p_string1;
  FOR i IN 1 .. LENGTH (p_string2) LOOP
    IF INSTR (v_temp, SUBSTR (p_string2, i, 1)) > 0 THEN
      v_char := SUBSTR (p_string2, i, 1);
      IF ABS (INSTR (p_string2, v_char) - INSTR (p_string1, v_char)) <= v_closeness THEN
        v_comp2 := v_comp2 || SUBSTR (p_string2, i, 1);
        v_temp := SUBSTR (v_temp, 1, INSTR (v_temp, SUBSTR (p_string2, i, 1)) - 1)
               || SUBSTR (v_temp, INSTR (v_temp, SUBSTR (p_string2, i, 1)) + 1);
      END IF;
    END IF;    
  END LOOP;
  -- check for null strings:
  IF v_comp1 IS NULL OR v_comp2 IS NULL THEN 
    RETURN 0;
  END IF;
  -- count matches and transpositions within closeness:
  FOR i IN 1 .. LEAST (LENGTH (v_comp1), LENGTH (v_comp2)) LOOP
    IF SUBSTR (v_comp1, i, 1) = SUBSTR (v_comp2, i, 1) THEN
      v_matches := v_matches + 1;
    ELSE
      v_char := SUBSTR (v_comp1, i, 1);
      IF ABS (INSTR (p_string1, v_char) - INSTR (p_string2, v_char)) <= v_closeness THEN
        v_transpositions := v_transpositions + 1;
        v_matches := v_matches + 1;
      END IF; 
    END IF;
  END LOOP;
  v_transpositions := v_transpositions / 2;
  -- check for no matches:
  IF v_matches = 0
    THEN RETURN 0;
  END IF;
  -- Jaro:
  v_d_jaro := ((v_matches / LENGTH (p_string1)) + 
               (v_matches / LENGTH (p_string2)) +
               ((v_matches - v_transpositions) / v_matches)) 
               / 3;
  -- count matching leading characters (up to 4):
  FOR i IN 1 .. LEAST (LENGTH (p_string1), LENGTH (p_string2), 4) LOOP
    IF SUBSTR (p_string1, i, 1) = SUBSTR (p_string2, i, 1) THEN
      v_leading := v_leading + 1;
    ELSE
      EXIT;
    END IF;
  END LOOP;
  -- Winkler:
  v_d_winkler := v_d_jaro + ((v_leading * .1) * (1 - v_d_jaro));
  -- Jaro-Winkler similarity rounded:
  v_jws := ROUND (v_d_winkler * 100);
  RETURN v_jws;
END jws;
/


SCOTT@orcl_11gR2> WITH
  2    strings AS
  3  	 (SELECT NULL	      string1, NULL	    string2 FROM DUAL UNION ALL
  4  	  SELECT 'test'       string1, NULL	    string2 FROM DUAL UNION ALL
  5  	  SELECT NULL	      string1, 'test'	    string2 FROM DUAL UNION ALL
  6  	  SELECT 'CRATE'      string1, 'TRACE'	    string2 FROM DUAL UNION ALL
  7  	  SELECT 'MARTHA'     string1, 'MARHTA'     string2 FROM DUAL UNION ALL
  8  	  SELECT 'DWAYNE'     string1, 'DUANE'	    string2 FROM DUAL UNION ALL
  9  	  SELECT 'DIXON'      string1, 'DICKSONX'   string2 FROM DUAL UNION ALL
 10  	  SELECT 'Dunningham' string1, 'Cunningham' string2 FROM DUAL UNION ALL
 11  	  SELECT 'Abroms'     string1, 'Abrams'     string2 FROM DUAL UNION ALL
 12  	  SELECT 'Lampley'    string1, 'Campley'    string2 FROM DUAL UNION ALL
 13  	  SELECT 'Jonathon'   string1, 'Jonathan'   string2 FROM DUAL UNION ALL
 14  	  SELECT 'Jeraldine'  string1, 'Gerladine'  string2 FROM DUAL UNION ALL
 15  	  SELECT 'test'       string1, 'blank'	    string2 FROM DUAL UNION ALL
 16  	  SELECT 'everybody'  string1, 'every'	    string2 FROM DUAL UNION ALL
 17  	  SELECT 'a'	      string1, 'aaa'	      string2 FROM DUAL)
 18  SELECT string1, string2,
 19  	    UTL_MATCH.JARO_WINKLER_SIMILARITY (string1, string2) oracle_jws,
 20  	    jws (string1, string2) my_jws
 21  FROM   strings
 22  ORDER  BY my_jws DESC
 23  /

STRING1    STRING2    ORACLE_JWS     MY_JWS
---------- ---------- ---------- ----------
MARTHA     MARHTA             96         96
Jonathon   Jonathan           95         95
Dunningham Cunningham         93         93
Abroms     Abrams             92         92
everybody  every              91         91
Lampley    Campley            90         90
Jeraldine  Gerladine          88         88
DWAYNE     DUANE              84         84
DIXON      DICKSONX           81         81
a          aaa                80         80
CRATE      TRACE              73         73
test       blank               0          0
test                           0          0
           test                0          0
                               0          0

15 rows selected.

SCOTT@orcl_11gR2>

Re: Jaro-Winkler similarity for 9i [message #486609 is a reply to message #486569] Fri, 17 December 2010 04:58 Go to previous messageGo to next message
mnitu
Messages: 159
Registered: February 2008
Location: Reims
Senior Member
Hi Barbara,
Thanks for sharing your function, that's very nice. I have two remarks:
a) It seems that there are some difference between Oracle version and your version concerning accents.
Connected to Oracle Database 10g Enterprise Edition Release 10.2.0.4.0 
Connected as mni
 
SQL> 
SQL> WITH strings AS (
  2     SELECT 'Géraldine'  string1, 'Gerladine'  string2 FROM DUAL UNION ALL
  3     SELECT 'Jérôme'  string1, 'Jerome'  string2 FROM DUAL UNION ALL
  4     SELECT 'ça'  string1, 'ca'  string2 FROM DUAL UNION ALL
  5     SELECT 'Üwe'  string1, 'Uwe'  string2 FROM DUAL
  6  )
  7  SELECT string1, string2,
  8         UTL_MATCH.JARO_WINKLER_SIMILARITY (string1, string2) oracle_jws,
  9         jws (string1, string2) my_jws
 10    FROM strings
 11   ORDER  BY my_jws DESC
 12  /
 
STRING1   STRING2   ORACLE_JWS     MY_JWS
--------- --------- ---------- ----------
Géraldine Gerladine         89         82
Üwe       Uwe               77         78
Jérôme    Jerome            80         70
ça        ca                66         67

b) Your function, as actually defined, is not really deterministic
Connected to Oracle Database 10g Enterprise Edition Release 10.2.0.4.0 
Connected as mni
 
SQL> alter session set nls_comp = 'LINGUISTIC';
 
Session altered
 
SQL> ALTER SESSION SET NLS_SORT=BINARY_AI;
 
Session altered
 
SQL> 
SQL> WITH strings AS (
  2     SELECT 'Géraldine'  string1, 'Gerladine'  string2 FROM DUAL UNION ALL
  3     SELECT 'Jérôme'  string1, 'Jerome'  string2 FROM DUAL UNION ALL
  4     SELECT 'ça'  string1, 'ca'  string2 FROM DUAL UNION ALL
  5     SELECT 'Üwe'  string1, 'Uwe'  string2 FROM DUAL
  6  )
  7  SELECT string1, string2,
  8         UTL_MATCH.JARO_WINKLER_SIMILARITY (string1, string2) oracle_jws,
  9         jws (string1, string2) my_jws
 10    FROM strings
 11   ORDER  BY my_jws DESC
 12  /
 
STRING1   STRING2   ORACLE_JWS     MY_JWS
--------- --------- ---------- ----------
Jérôme    Jerome            80        100
Üwe       Uwe               77        100
ça        ca                66        100
Géraldine Gerladine         89         97
Re: Jaro-Winkler similarity for 9i [message #486616 is a reply to message #486609] Fri, 17 December 2010 06:47 Go to previous messageGo to next message
Barbara Boehmer
Messages: 9090
Registered: November 2002
Location: California, USA
Senior Member
I had not considered accents, since there aren't any in English, and the Jaro-Winkler algorithm was created for name comparisons in English in the U.S.

I used the following as a guide for creating the algorithm:

http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance

and added in the test data from the following:

http://download.oracle.com/docs/cd/E11882_01/appdev.112/e16760/u_match.htm#CHDJAIJD

I also tested null values and modified the function to handle them the way that Oracle does. I can only guess at what additional things Oracle may have done in their compiled code that I can't see. I am not sure if the requestor wanted something to match Oracle's function or something that does a proper Jaro-Winkler comparison.

The different sort algorithms have an interesting effect. It causes my function to either ignore the accents or treat the accented characters as entirely different characters. However, it looks like Oracle's function treated them differently regardless of the sort, but not weighted the same as separate characters, but weighted some much higher and some slightly lower. I suppose one could dynamically alter the sort within the function to force it to be deterministic, but that takes away some flexibility from the user.




Re: Jaro-Winkler similarity for 9i [message #486640 is a reply to message #486616] Fri, 17 December 2010 10:43 Go to previous messageGo to next message
Michel Cadot
Messages: 68647
Registered: March 2007
Location: Nanterre, France, http://...
Senior Member
Account Moderator
Maybe a solution to this accented characters is to first convert them to their non-accented equivalent.
I have a TRANSLATE expression I used for a close thing:
translate (val,
           'ƒŠŒŽšœžŸÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïøðñòóôõöùúûüýÿÞþ',
           'fSEZsezYAAAAAAECEEEEIIIIDNOOOOOOUUUUYBaaaaaaeceeeeiiiioonooooouuuuyy')


Regards
Michel

[Updated on: Fri, 17 December 2010 10:45]

Report message to a moderator

Re: Jaro-Winkler similarity for 9i [message #486647 is a reply to message #486640] Fri, 17 December 2010 12:27 Go to previous messageGo to next message
Barbara Boehmer
Messages: 9090
Registered: November 2002
Location: California, USA
Senior Member
Michel Cadot wrote on Fri, 17 December 2010 08:43

Maybe a solution to this accented characters is to first convert them to their non-accented equivalent.
I have a TRANSLATE expression I used for a close thing:
translate (val,
           'ƒŠŒŽšœžŸÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïøðñòóôõöùúûüýÿÞþ',
           'fSEZsezYAAAAAAECEEEEIIIIDNOOOOOOUUUUYBaaaaaaeceeeeiiiioonooooouuuuyy')


Regards
Michel


That would work if you want to ignore the accents. Translate could either be applied to the strings before they were passed to the function or used in the function, which would make it truly deterministic. If you wanted to match Oracle, I don't know what sort of weight algorithm they applied to the various accents. Realistically, I think it makes more sense to ignore the accents when comparing names, looking for duplicates. Based on your suggestion, I have provided a revised function and test below. I find it odd that Oracle's utl_match.jaro_winkler_similarity considers "ça" and "ca" a complete non-match, instead of an exact match or even partial match. Also, this is not the results that mnitu got, perhaps due to a difference between his 10g and my 11g.

CREATE OR REPLACE FUNCTION jws -- Jaro-Winkler similarity
  (p_string1     IN VARCHAR2,
   p_string2     IN VARCHAR2)
  RETURN            NUMBER
  DETERMINISTIC
AS
  v_string1         VARCHAR2 (32767);
  v_string2         VARCHAR2 (32767);
  v_closeness       NUMBER := 0;
  v_temp            VARCHAR2 (32767);
  v_comp1           VARCHAR2 (32767);
  v_comp2           VARCHAR2 (32767);
  v_matches         NUMBER := 0; 
  v_char            VARCHAR2 (1);
  v_transpositions  NUMBER := 0;
  v_d_jaro          NUMBER := 0;
  v_leading         NUMBER := 0;
  v_d_winkler       NUMBER := 0;
  v_jws             NUMBER := 0;
BEGIN
  -- check for null strings:
  IF p_string1 IS NULL OR p_string2 IS NULL THEN 
    RETURN 0;
  END IF;
  -- remove accents:
  v_string1 := translate (p_string1,
            'ƒŠŒŽšœžŸÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïøðñòóôõöùúûüýÿÞþ',
            'fSEZsezYAAAAAAECEEEEIIIIDNOOOOOOUUUUYBaaaaaaeceeeeiiiioonooooouuuuyy');
  v_string2 := translate (p_string2,
            'ƒŠŒŽšœžŸÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝßàáâãäåæçèéêëìíîïøðñòóôõöùúûüýÿÞþ',
            'fSEZsezYAAAAAAECEEEEIIIIDNOOOOOOUUUUYBaaaaaaeceeeeiiiioonooooouuuuyy');
  -- closeness:
  v_closeness := (GREATEST (LENGTH (v_string1), LENGTH (v_string2)) / 2) - 1;
  -- find matching characters and transpositions within closeness:
  v_temp := v_string2;
  FOR i IN 1 .. LENGTH (v_string1) LOOP
    IF INSTR (v_temp, SUBSTR (v_string1, i, 1)) > 0 THEN
      v_char := SUBSTR (v_string1, i, 1);
      IF ABS (INSTR (v_string1, v_char) - INSTR (v_string2, v_char)) <= v_closeness THEN
        v_comp1 := v_comp1 || SUBSTR (v_string1, i, 1);
        v_temp := SUBSTR (v_temp, 1, INSTR (v_temp, SUBSTR (v_string1, i, 1)) - 1)
               || SUBSTR (v_temp, INSTR (v_temp, SUBSTR (v_string1, i, 1)) + 1);
      END IF;
    END IF;    
  END LOOP;
  v_temp := v_string1;
  FOR i IN 1 .. LENGTH (v_string2) LOOP
    IF INSTR (v_temp, SUBSTR (v_string2, i, 1)) > 0 THEN
      v_char := SUBSTR (v_string2, i, 1);
      IF ABS (INSTR (v_string2, v_char) - INSTR (v_string1, v_char)) <= v_closeness THEN
        v_comp2 := v_comp2 || SUBSTR (v_string2, i, 1);
        v_temp := SUBSTR (v_temp, 1, INSTR (v_temp, SUBSTR (v_string2, i, 1)) - 1)
               || SUBSTR (v_temp, INSTR (v_temp, SUBSTR (v_string2, i, 1)) + 1);
      END IF;
    END IF;    
  END LOOP;
  -- check for null strings:
  IF v_comp1 IS NULL OR v_comp2 IS NULL THEN 
    RETURN 0;
  END IF;
  -- count matches and transpositions within closeness:
  FOR i IN 1 .. LEAST (LENGTH (v_comp1), LENGTH (v_comp2)) LOOP
    IF SUBSTR (v_comp1, i, 1) = SUBSTR (v_comp2, i, 1) THEN
      v_matches := v_matches + 1;
    ELSE
      v_char := SUBSTR (v_comp1, i, 1);
      IF ABS (INSTR (v_string1, v_char) - INSTR (v_string2, v_char)) <= v_closeness THEN
        v_transpositions := v_transpositions + 1;
        v_matches := v_matches + 1;
      END IF; 
    END IF;
  END LOOP;
  v_transpositions := v_transpositions / 2;
  -- check for no matches:
  IF v_matches = 0
    THEN RETURN 0;
  END IF;
  -- Jaro:
  v_d_jaro := ((v_matches / LENGTH (v_string1)) + 
               (v_matches / LENGTH (v_string2)) +
               ((v_matches - v_transpositions) / v_matches)) 
               / 3;
  -- count matching leading characters (up to 4):
  FOR i IN 1 .. LEAST (LENGTH (v_string1), LENGTH (v_string2), 4) LOOP
    IF SUBSTR (v_string1, i, 1) = SUBSTR (v_string2, i, 1) THEN
      v_leading := v_leading + 1;
    ELSE
      EXIT;
    END IF;
  END LOOP;
  -- Winkler:
  v_d_winkler := v_d_jaro + ((v_leading * .1) * (1 - v_d_jaro));
  -- Jaro-Winkler similarity rounded:
  v_jws := ROUND (v_d_winkler * 100);
  RETURN v_jws;
END jws;
/


SCOTT@orcl_11gR2> WITH
  2    strings AS
  3  	 (SELECT NULL	      string1, NULL	    string2 FROM DUAL UNION ALL
  4  	  SELECT 'test'       string1, NULL	    string2 FROM DUAL UNION ALL
  5  	  SELECT NULL	      string1, 'test'	    string2 FROM DUAL UNION ALL
  6  	  SELECT 'CRATE'      string1, 'TRACE'	    string2 FROM DUAL UNION ALL
  7  	  SELECT 'MARTHA'     string1, 'MARHTA'     string2 FROM DUAL UNION ALL
  8  	  SELECT 'DWAYNE'     string1, 'DUANE'	    string2 FROM DUAL UNION ALL
  9  	  SELECT 'DIXON'      string1, 'DICKSONX'   string2 FROM DUAL UNION ALL
 10  	  SELECT 'Dunningham' string1, 'Cunningham' string2 FROM DUAL UNION ALL
 11  	  SELECT 'Abroms'     string1, 'Abrams'     string2 FROM DUAL UNION ALL
 12  	  SELECT 'Lampley'    string1, 'Campley'    string2 FROM DUAL UNION ALL
 13  	  SELECT 'Jonathon'   string1, 'Jonathan'   string2 FROM DUAL UNION ALL
 14  	  SELECT 'Jeraldine'  string1, 'Gerladine'  string2 FROM DUAL UNION ALL
 15  	  SELECT 'test'       string1, 'blank'	    string2 FROM DUAL UNION ALL
 16  	  SELECT 'everybody'  string1, 'every'	    string2 FROM DUAL UNION ALL
 17  	  SELECT 'a'	      string1, 'aaa'	    string2 FROM DUAL UNION ALL
 18  	  SELECT 'Géraldine'  string1, 'Gerladine'  string2 FROM DUAL UNION ALL
 19  	  SELECT 'Jérôme'     string1, 'Jerome'     string2 FROM DUAL UNION ALL
 20  	  SELECT 'ça'	      string1, 'ca'	    string2 FROM DUAL UNION ALL
 21  	  SELECT 'Üwe'	      string1, 'Uwe'	    string2 FROM DUAL)
 22  SELECT string1, string2,
 23  	    UTL_MATCH.JARO_WINKLER_SIMILARITY (string1, string2) oracle_jws,
 24  	    jws (string1, string2) my_jws
 25  FROM   strings
 26  ORDER  BY my_jws DESC
 27  /

STRING1    STRING2    ORACLE_JWS     MY_JWS
---------- ---------- ---------- ----------
ça         ca                  0        100
Jérôme     Jerome             74        100
Üwe        Uwe                72        100
Géraldine  Gerladine          86         97
MARTHA     MARHTA             96         96
Jonathon   Jonathan           95         95
Dunningham Cunningham         93         93
Abroms     Abrams             92         92
everybody  every              91         91
Lampley    Campley            90         90
Jeraldine  Gerladine          88         88
DWAYNE     DUANE              84         84
DIXON      DICKSONX           81         81
a          aaa                80         80
CRATE      TRACE              73         73
test       blank               0          0
           test                0          0
test                           0          0
                               0          0

19 rows selected.

SCOTT@orcl_11gR2>







Re: Jaro-Winkler similarity for 9i [message #689035 is a reply to message #486569] Mon, 28 August 2023 13:31 Go to previous messageGo to next message
mathguy
Messages: 107
Registered: January 2023
Senior Member
Only 13 years later - I hope it's still of interest Smile

A recent discussion on AskTom made me write my own Jaro-Winkler similarity function. Testing against Oracle's version I found that the Oracle implementation is (almost surely) buggy. I discuss it in a reply titled Is there a bug in the Oracle implementation of Jaro-Winkler? in this thread: https://asktom.oracle.com/pls/apex/asktom.search?tag=performace-issue-with-sql-in-the-view-using-utl-matchjaro-winkler-similarity& ;p_session=9590653204898

At least some if not all of the discrepancies you saw between your values and Oracle's results are caused by what I reported there: Oracle has a funny way of calculating "transpositions". I don't know exactly what they do; the errors only seem to exist when the initial count is odd, and then must be divided by 2. In some cases it looks like Oracle truncates the result (which is incorrect - that can be caused either by a misunderstanding of the Jaro definition, or by careless programming in C, dividing by 2 instead of 2.0). But in other cases the value is rounded up - so either there are multiple bugs, or there is only one but it is more complex than simple "integer division", whether accidental or intended.

I found your thread here on OraFAQ doing a search for "Jaro-Winkler accented characters" since I found that my function differs from Oracle's for these too. I can see in a very simple example what Oracle is doing: when computing string length, it counts an accented letter as TWO characters. The Oracle function is written in C; I assume it counts bytes, not characters. Perhaps the Oracle function is only meant to be used for single-byte characters - but if that was the intent, there is no indication of it in the documentation.

I will wait for Chris Saxon to respond on AskTom - then I might write something on the Oracle forum, discussing other aspects of the question from the AskTom thread. I will show my version of the JW function, and also how it could be used in a materialized view (as discussed on AskTom, the Oracle function can't be used in a fast refreshing MV, because somehow it "maintains state" - probably a consequence of using package constants that are not declared as constants.)
Re: Jaro-Winkler similarity for 9i [message #689036 is a reply to message #689035] Tue, 29 August 2023 03:26 Go to previous messageGo to next message
Barbara Boehmer
Messages: 9090
Registered: November 2002
Location: California, USA
Senior Member
There is a lot of interesting stuff in the asktom thread about guessing what is in Oracle's hidden code that causes it to return seemingly wrong results and maintain a state such that a materialized view cannot be fast refreshed on commit.

The original question was about improving the performance of a view that uses jaro_winkler_similarity.  I posted a comment and demonstration of using a context index with fuzzy search that uses the domain index to narrow the search results.  The jaro_winkler_similarity can them be used on top of that.  The scores of both are adjustable.

A long time ago, I wrote my own edit distance function before Oracle had a built-in edit distance.  Even after Oracle supplied an edit distance and edit distance similarity, I found myself making modifications to my original, like using the Damerau-Levenshtein distance formula, in which the switching of two adjacent characters is counted as 1 change instead of 2 replacements.  I have provided my old code and an example below.


CREATE OR REPLACE FUNCTION dld -- Damerau–Levenshtein distance
  (str1              VARCHAR2 DEFAULT NULL,
   str2              VARCHAR2 DEFAULT NULL)
  RETURN NUMBER DETERMINISTIC
AS
  lenStr1            NUMBER := NVL (LENGTH (str1), 0);
  lenStr2            NUMBER := NVL (LENGTH (str2), 0);
  TYPE mytabtype IS  TABLE OF NUMBER INDEX BY BINARY_INTEGER;
  TYPE myarray IS    TABLE OF mytabtype INDEX BY BINARY_INTEGER;
  d                  myarray;
  cost               NUMBER := 0;
BEGIN
  IF str1 = str2 THEN
    RETURN 0;
  ELSIF lenStr1 = 0 OR lenStr2 = 0 THEN
    RETURN GREATEST (lenStr1, lenStr2);
  ELSIF lenStr1 = 1 AND lenStr2 = 1 AND str1 <> str2 THEN
    RETURN 1;
  ELSE
    FOR j in 0 .. lenStr2 LOOP
      d (0) (j) := j;
    END LOOP;
    FOR i in 1 .. lenStr1 LOOP
      d (i) (0) := i;
      FOR j in 1 .. lenStr2 LOOP
        IF SUBSTR (str1, i, 1) = SUBSTR (str2, j, 1) THEN
          cost := 0;
        ELSE
          cost := 1;
        END IF;
        d (i) (j) := LEAST (
                              d (i-1) (j)   + 1,      -- deletion
                              d (i)   (j-1) + 1,      -- insertion
                              d (i-1) (j-1) + cost    -- substitution
                           );
        IF i > 1 AND j > 1
           AND SUBSTR (str1, i,   1) = SUBSTR (str2, j-1, 1)
           AND SUBSTR (str1, i-1, 1) = SUBSTR (str2, j,   1)
        THEN
          d (i) (j) := LEAST (
                                d (i)   (j),      
                                d (i-2) (J-2) + cost  -- transposition
                             );
        END IF;              
      END LOOP;
    END LOOP;
    RETURN d (lenStr1) (lenStr2);
  END IF;
END dld;
/


C##SCOTT@XE_21.3.0.0.0> SELECT UTL_MATCH.EDIT_DISTANCE ('lion', 'loin'),
  2         dld ('lion', 'loin')
  3  FROM   DUAL
  4  /

UTL_MATCH.EDIT_DISTANCE('LION','LOIN') DLD('LION','LOIN')
-------------------------------------- ------------------
                                     2                  1

1 row selected.

[Updated on: Tue, 29 August 2023 03:54]

Report message to a moderator

Re: Jaro-Winkler similarity for 9i [message #689057 is a reply to message #486569] Sat, 02 September 2023 13:07 Go to previous messageGo to next message
mathguy
Messages: 107
Registered: January 2023
Senior Member
Hi Barbara,

I took a look at your function from 2010, and unfortunately it has a bug. I found it by comparing my results to yours, seeing that they disagree in some cases, and then I read your code and found the problem.

Let me illustrate, and then I will explain the bug in detail.

select str1, str2,
       utl_match.jaro_winkler_similarity(str1, str2) as orcl_jws,
       jws(str1,str2) as bb_jws
from   (select 'ab' as str1, 'bb'as str2 from dual)
;

STR1 STR2   ORCL_JWS     BB_JWS
---- ---- ---------- ----------
ab   bb           66          0
I am comparing two strings, 'ab' and 'bb'. Both have length 2; the "match distance" is trunc( greatest(2, 2) / 2) - 1 which equals 0. That is, characters match only if they are in the same position in both strings.

The letter 'b' in second position in both strings is a match. There are no transpositions (there can't be any, when there is exactly one match). The first character is different, so there is no Winkler correction to the Jaro calculation.

The Jaro similarity is calculated as follows: both lengths are 2, there is one match, and no transpositions; so...

(1/2 + 1/2 + (1-0)/1) / 3 = 2/3 or 0.66666...

or, "normalized" to a score between 1 and 100, it's 66.66666, truncated to 66. Oracle's calculation is correct in this case.

Your calculation shows 0, because the part of the code that identifies matches doesn't follow the correct algorithm.

The Jaro algorithm is like this: First, since we will need to keep track of characters already found in prior matches, we will need to flag characters as "matched" or "unmatched". Initially all characters are marked "unmatched". Then: for each character in the first string, the first step is to identify the "matching window" - from P - M to P + M, where P is the position of the character in the first string and M is the maximum matching distance (adjusted so that P - M is at least 0, and P + M is at most the length of the second string). THEN - and ONLY THEN - we look for the first unmarked character in the "matching window", that equals the character at position P in the first string.

Your function does something different: It takes a character from the first string, then it looks for the first "unmatched" character in the second string, that equals the character from the first string (if any), and ONLY THEN it checks to see if it is in the "matching window".

The fact that the two methods are not equivalent is illustrated by the example I gave. When comparing 'ab' with 'bb', the 'a' in the first string doesn't match anything in the second string. Then the 'b' in the first string is considered.

In the correct Jaro computation, the maximum "matching distance" is 0. The window to search in the second string is just the same position as in the first string. So we look at the character in second position in the second string. It's 'b', which is a match.

In your approach, when we inspect the second character of 'ab' (the character 'b'), there is nothing marked "matched" in the second string yet. We find the first matching character in the second string, WITH NO REGARD TO THE MATCHING WINDOW. That character is the FIRST 'b' in the second string. Then we compare positions and find that this isn't a match because the position in the second string doesn't fall in the "matching window". At this point your function doesn't look for another match, that may be IN the matching window. So it counts no matches at all between the two strings.

Your function does other things I didn't quite understand (and I think aren't right), but I didn't look in detail. You don't seem to keep track of which characters were already "used up" in earlier matches, in a SINGLE loop, over the first string - as the Jaro definition requires. Instead, you find all the "matches" (according to your definition) for the first string, and then separately all the "matches" for the second string. This has no equivalent in the Jaro definition. In particular, this is not equivalent to finding the matching PAIRS, on which transpositions are calculated.

Your function will give the correct results, I believe, if the strings don't have repeated characters, or if there are repeated characters but they are in the same order compared to other matching pairs. It will fail on examples like 'abca' and 'bcaa' (it does fail in that example, where the Oracle result is correct):

select str1, str2,
       utl_match.jaro_winkler_similarity(str1, str2) as orcl_jws,
       jws(str1,str2) as bb_jws
from   (select 'abca' as str1, 'bcaa'as str2 from dual)
;

STR1 STR2   ORCL_JWS     BB_JWS
---- ---- ---------- ----------
abca bcaa         83         67
AS AN ASIDE (unrelated in any way to the bug):

Consider the case of strings of length 1. Like 'a' and 'a'. Any normal-brained person would consider those as a "perfect match", with a similarity score of 1.000 (or "100"). However, a strict interpretation of the Jaro definition should find no match! That is because GREATEST(LENGTH_1, LENGTH_2) is 1; then TRUNC(1/2) - 1 equals -1, so the "matching window" is always empty! Indeed, the "distance" between two positions is never "less than or equal to -1" (it is always non-negative).

Oracle, when they implemented JW in the UTL_MATCH function, decided to ignore the Jaro definition in this respect; they calculate 100 as the JW similarity. Your function calculates 0 - but I don't consider that as a bug in your code. Rather, it is a bug in Jaro's own definition. The "maximum matching distance" should be defined as GREATEST(0, JARO_DEFINITION) where the latter is Jaro's definition, in order to work as expected for this special case.

Of course, in real life people would rarely consider the "similarity" of one-character strings - although if we are matching addresses component by component and something like "apartment" can be '3' or '5', then we do consider one-character strings. By contrast, the bug I described earlier seems more general.
Re: Jaro-Winkler similarity for 9i [message #689058 is a reply to message #689057] Sat, 02 September 2023 14:02 Go to previous messageGo to next message
Barbara Boehmer
Messages: 9090
Registered: November 2002
Location: California, USA
Senior Member
Hi mathguy,

I stumbled upon the following that I think you will find interesting and may shed some light on the discrepancies between Oracle's function and the standard definition that seem like they might be bugs.

https://docs.oracle.com/cd/E19182-01/821-0919/gfjtr/index.html

There are multiple links to other pages and links within those pages.

It was a lot of years ago that I created that function and I remember it took me a long time to think through it.

At this point, I believe:

The formula that Oracle uses is not visible and the name may be misleading as it actually incorporates other methods.

Oracle's function may or may not have bugs.

My function may or may not have bugs and at this point I would not know what to compare it to, in order to fix it.  I don't feel like messing with it, but if you want to fix mine or create your own, please go ahead and share the result.

I miss the old days when asktom meant ask Tom Kyte and not ask the Oracle masters.  Tom used to get to the bottom of things and either explain in detail and demonstrate or file a bug report.

Regards,
Barbara

[Updated on: Sat, 02 September 2023 14:05]

Report message to a moderator

Re: Jaro-Winkler similarity for 9i [message #689059 is a reply to message #689057] Sat, 02 September 2023 14:22 Go to previous messageGo to next message
mathguy
Messages: 107
Registered: January 2023
Senior Member
For what it's worth, here is my implementation of the Jaro-Winkler similarity function.

UTL_MATCH has a "Jaro-Winkler" function and a "Jaro-Winkler similarity" function. The latter takes the return value from the former, it multiplies by 100 and truncates the result. This is NOT the theoretical definition of the "similarity" function, it's Oracle's terminology. The theoretical similarity function is the FIRST Oracle function, the one that returns a decimal value between 0 and 1 (not an integer between 1 and 100). My function computes "Jaro-Winkler similarity" as a decimal value between 0 and 1, and should be compared to UTL_MATCH.JARO_WINKLER (not JARO_WINKLER_SIMILARITY).

In my implementation, I define "maximum matching distance" as GREATEST(JMD, 0) where JMD is the "maximum matching distance" defined by Jaro himself. This is to correct for a mistake (I believe) in Jaro's definition; namely, when both strings have length 1, Jaro's definition of "maximum matching distance" is -1, which makes no sense. The similarity of a one-character string with itself should be 1, not 0. Oracle did the same in their implementation.

Note about NULL: Oracle views NULL and the empty string as the same, so we must make a choice. In a sane system (which Oracle DB is not), the similarity function should return NULL if either string is NULL, it should return 1 if both strings are the empty string (seen as different from NULL), and it should return 0 if one string is the empty string and the other is a non-NULL, non-empty string. Here I followed the UTL_MATCH lead: if either string is NULL (or "empty"), the function returns 0. It doesn't return NULL in that case; and it doesn't return 1 even if both strings are NULL ("empty").

The code is written for - and tested in - Oracle 12; for example I use PRAGMA UDF (to optimize the function for being called from SQL), which didn't exist before Oracle 12. However, it can be easily modified to work for earlier versions.

For the Winkler adjustment, I hard-coded the maximum prefix length as 4 and the prefix scaling factor as 0.1 (as in the original Winkler paper, and as Oracle has done). These can be added as function parameters - and the code modified as needed - if it matters. If that is done, error handling must be added to check that the product of the two is less than or equal to 1.

The function works for ASCII text - and, moreover, I only tested on my system, where the character set and encoding uses single bytes for ASCII characters. I don't know how (or even if) this would work with UTF-16 encoding, for example. This has to do, in particular, with accented characters and such.

Here goes:

create or replace function jw (str1 varchar2, str2 varchar2)
  return number
  authid definer
  deterministic
as
  pragma udf;

  -- js will hold the Jaro similarity; d will hold the return value
  js number;
  d  number := 0;

  -- variables to hold matches, transpositions, and init agreement
  -- my "transpositions" number is the actual count, not divided by 2;
  -- I divide by 2 in the final formula
  matches        number;
  transpositions number;
  init_agreement number;

  -- We will use two arrays of boolean to track matching
  type arr_of_boolean is varray(4000) of boolean;
  b1 arr_of_boolean;
  b2 arr_of_boolean;

  -- len1 and len2 will hold the length of the two input strings
  -- lenm is the greater of the two lengths
  -- lenw is used for the Winkler correction
  -- max_distance is the maximum distance for matching in the Jaro definition
  len1 number;
  len2 number;
  lenm number;
  lenw number;
  max_distance number;

  -- r_start and r_end mark the allowed range of matches in str2
  r_start number;
  r_end   number;

  -- loop variables
  k number;
begin
  -- if str1 or str2 is null, the function will return 0
  if str1 is not null and str2 is not null then

    -- collect the various length and related values
    len1 := length(str1);
    len2 := length(str2);
    lenm := greatest(len1, len2);
    lenw := least(len1, len2, 4);

    -- max distance for matches (watch out for lenm = 1!) 
    -- THIS IS DIFFERENT FROM JARO DEFINITION by adding GREATEST(..., 0)
    max_distance := greatest(trunc(lenm/2) - 1, 0);

    -- define the arrays of booleans
    -- elements are initialized with null, which is OK for us
    -- a value changes to TRUE when the corresponding character appears in a match 
    b1   := arr_of_boolean();
    b2   := arr_of_boolean();
    b1.extend(len1);
    b2.extend(len2);

    -- find the matches
    matches := 0;
    for i in 1 .. len1 loop
      r_start := greatest(1, i - max_distance);
      r_end   := least(len2, i + max_distance);
      for k in r_start .. r_end loop
        if b2(k) then
          continue;
        end if;
        if substr(str1, i, 1) != substr(str2, k, 1) then
          continue;
        end if;
        b1(i) := true;
        b2(k) := true;
        matches := matches + 1;
        exit;
      end loop;
    end loop;

    -- Continue only if matches > 0; else d = 0 will be returned

    if matches > 0 then
      -- Compute transpositions
      transpositions := 0;
      k := 1;
      for i in 1 .. len1 loop
        if b1(i) is null then
          continue;
        end if;
        while b2(k) is null loop
          k := k + 1;
        end loop;
        if substr(str1, i, 1) != substr(str2, k, 1) then
          transpositions := transpositions + 1;
        end if;
        k := k + 1;
      end loop;

      -- Compute Jaro similarity
      js := (matches * (1/len1 + 1/len2) + 1 - transpositions/(2 * matches)) / 3;

      -- Compute initial agreement for Winkler correction
      init_agreement := 0;
      k := 1;
      while k <= lenw and substr(str1, k, 1) = substr(str2, k, 1) loop
        init_agreement := init_agreement + 1;
        k := k + 1;
      end loop;

      -- Compute Jaro-Winkler similarity
      -- Assumes l = 4 and p = 0.1 as in Winkler's original paper
      d := js + init_agreement * 0.1 * (1 - js);
    end if;
  end if;

  return d;
end;
Re: Jaro-Winkler similarity for 9i [message #689060 is a reply to message #689059] Sat, 02 September 2023 15:09 Go to previous message
Barbara Boehmer
Messages: 9090
Registered: November 2002
Location: California, USA
Senior Member
Here is my test, removing the diacritical marks and multiplying your result by 100 and rounding it for comparison.  It seems to perfectly match Oracle, so if that was the goal then you succeeded, at least with this original dataset.


C##SCOTT@XE_21.3.0.0.0> create or replace function jw (str1 varchar2, str2 varchar2)
  2    -- author of function:  mathguy
  3    -- from OraFAQ thread:  http://www.orafaq.com/forum/mv/msg/164224/689059/#msg_689059
  4    return number
  5    authid definer
  6    deterministic
  7  as
  8    pragma udf;
  9  
 10    -- js will hold the Jaro similarity; d will hold the return value
 11    js number;
 12    d  number := 0;
 13  
 14    -- variables to hold matches, transpositions, and init agreement
 15    -- my "transpositions" number is the actual count, not divided by 2;
 16    -- I divide by 2 in the final formula
 17    matches	      number;
 18    transpositions number;
 19    init_agreement number;
 20  
 21    -- We will use two arrays of boolean to track matching
 22    type arr_of_boolean is varray(4000) of boolean;
 23    b1 arr_of_boolean;
 24    b2 arr_of_boolean;
 25  
 26    -- len1 and len2 will hold the length of the two input strings
 27    -- lenm is the greater of the two lengths
 28    -- lenw is used for the Winkler correction
 29    -- max_distance is the maximum distance for matching in the Jaro definition
 30    len1 number;
 31    len2 number;
 32    lenm number;
 33    lenw number;
 34    max_distance number;
 35  
 36    -- r_start and r_end mark the allowed range of matches in str2
 37    r_start number;
 38    r_end   number;
 39  
 40    -- loop variables
 41    k number;
 42  begin
 43    -- if str1 or str2 is null, the function will return 0
 44    if str1 is not null and str2 is not null then
 45  
 46  	 -- collect the various length and related values
 47  	 len1 := length(str1);
 48  	 len2 := length(str2);
 49  	 lenm := greatest(len1, len2);
 50  	 lenw := least(len1, len2, 4);
 51  
 52  	 -- max distance for matches (watch out for lenm = 1!)
 53  	 -- THIS IS DIFFERENT FROM JARO DEFINITION by adding GREATEST(..., 0)
 54  	 max_distance := greatest(trunc(lenm/2) - 1, 0);
 55  
 56  	 -- define the arrays of booleans
 57  	 -- elements are initialized with null, which is OK for us
 58  	 -- a value changes to TRUE when the corresponding character appears in a match
 59  	 b1   := arr_of_boolean();
 60  	 b2   := arr_of_boolean();
 61  	 b1.extend(len1);
 62  	 b2.extend(len2);
 63  
 64  	 -- find the matches
 65  	 matches := 0;
 66  	 for i in 1 .. len1 loop
 67  	   r_start := greatest(1, i - max_distance);
 68  	   r_end   := least(len2, i + max_distance);
 69  	   for k in r_start .. r_end loop
 70  	     if b2(k) then
 71  	       continue;
 72  	     end if;
 73  	     if substr(str1, i, 1) != substr(str2, k, 1) then
 74  	       continue;
 75  	     end if;
 76  	     b1(i) := true;
 77  	     b2(k) := true;
 78  	     matches := matches + 1;
 79  	     exit;
 80  	   end loop;
 81  	 end loop;
 82  
 83  	 -- Continue only if matches > 0; else d = 0 will be returned
 84  
 85  	 if matches > 0 then
 86  	   -- Compute transpositions
 87  	   transpositions := 0;
 88  	   k := 1;
 89  	   for i in 1 .. len1 loop
 90  	     if b1(i) is null then
 91  	       continue;
 92  	     end if;
 93  	     while b2(k) is null loop
 94  	       k := k + 1;
 95  	     end loop;
 96  	     if substr(str1, i, 1) != substr(str2, k, 1) then
 97  	       transpositions := transpositions + 1;
 98  	     end if;
 99  	     k := k + 1;
100  	   end loop;
101  
102  	   -- Compute Jaro similarity
103  	   js := (matches * (1/len1 + 1/len2) + 1 - transpositions/(2 * matches)) / 3;
104  
105  	   -- Compute initial agreement for Winkler correction
106  	   init_agreement := 0;
107  	   k := 1;
108  	   while k <= lenw and substr(str1, k, 1) = substr(str2, k, 1) loop
109  	     init_agreement := init_agreement + 1;
110  	     k := k + 1;
111  	   end loop;
112  
113  	   -- Compute Jaro-Winkler similarity
114  	   -- Assumes l = 4 and p = 0.1 as in Winkler's original paper
115  	   d := js + init_agreement * 0.1 * (1 - js);
116  	 end if;
117    end if;
118  
119    return d;
120  end;
121  /

Function created.

C##SCOTT@XE_21.3.0.0.0> SHOW ERRORS
No errors.
C##SCOTT@XE_21.3.0.0.0> WITH
  2    strings AS
  3  	 (SELECT NULL	      string1, NULL	    string2 FROM DUAL UNION ALL
  4  	  SELECT 'test'       string1, NULL	    string2 FROM DUAL UNION ALL
  5  	  SELECT NULL	      string1, 'test'	    string2 FROM DUAL UNION ALL
  6  	  SELECT 'CRATE'      string1, 'TRACE'	    string2 FROM DUAL UNION ALL
  7  	  SELECT 'MARTHA'     string1, 'MARHTA'     string2 FROM DUAL UNION ALL
  8  	  SELECT 'DWAYNE'     string1, 'DUANE'	    string2 FROM DUAL UNION ALL
  9  	  SELECT 'DIXON'      string1, 'DICKSONX'   string2 FROM DUAL UNION ALL
 10  	  SELECT 'Dunningham' string1, 'Cunningham' string2 FROM DUAL UNION ALL
 11  	  SELECT 'Abroms'     string1, 'Abrams'     string2 FROM DUAL UNION ALL
 12  	  SELECT 'Lampley'    string1, 'Campley'    string2 FROM DUAL UNION ALL
 13  	  SELECT 'Jonathon'   string1, 'Jonathan'   string2 FROM DUAL UNION ALL
 14  	  SELECT 'Jeraldine'  string1, 'Gerladine'  string2 FROM DUAL UNION ALL
 15  	  SELECT 'test'       string1, 'blank'	    string2 FROM DUAL UNION ALL
 16  	  SELECT 'everybody'  string1, 'every'	    string2 FROM DUAL UNION ALL
 17  	  SELECT 'a'	      string1, 'aaa'	    string2 FROM DUAL UNION ALL
 18  	  SELECT 'Geraldine'  string1, 'Gerladine'  string2 FROM DUAL UNION ALL
 19  	  SELECT 'Jerome'     string1, 'Jerome'     string2 FROM DUAL UNION ALL
 20  	  SELECT 'ca'	      string1, 'ca'	    string2 FROM DUAL UNION ALL
 21  	  SELECT 'Uwe'	      string1, 'Uwe'	    string2 FROM DUAL)
 22  SELECT string1, string2,
 23  	    UTL_MATCH.JARO_WINKLER_SIMILARITY (string1, string2) oracle_jws,
 24  	    round (jw (string1, string2) * 100) mathguy_jws -- multiplied by 100 and rounded for comparison
 25  FROM   strings
 26  ORDER  BY mathguy_jws DESC
 27  /

STRING1    STRING2    ORACLE_JWS MATHGUY_JWS                                    
---------- ---------- ---------- -----------                                    
ca         ca                100         100                                    
Jerome     Jerome            100         100                                    
Uwe        Uwe               100         100                                    
Geraldine  Gerladine          97          97                                    
MARTHA     MARHTA             96          96                                    
Jonathon   Jonathan           95          95                                    
Dunningham Cunningham         93          93                                    
Abroms     Abrams             92          92                                    
everybody  every              91          91                                    
Lampley    Campley            90          90                                    
Jeraldine  Gerladine          88          88                                    
DWAYNE     DUANE              84          84                                    
DIXON      DICKSONX           81          81                                    
a          aaa                80          80                                    
CRATE      TRACE              73          73                                    
test       blank               0           0                                    
           test                0           0                                    
test                           0           0                                    
                               0           0                                    

19 rows selected.






Previous Topic: ora-01775 looping chain of synonyms in forms 6i (merged)
Next Topic: Consuming Rest API from PL/SQL using UTL_HTTP - body not recognized
Goto Forum:
  


Current Time: Sat Apr 27 16:41:24 CDT 2024