From 6132ed9e2a63fb53a3512139d3cbc830c69c595e Mon Sep 17 00:00:00 2001
From: Jihun Lee <zeroion83@gmail.com>
Date: Sun, 19 Sep 2021 15:00:03 +0900
Subject: [PATCH] Improve word tokenization for non-Latin characters

---
 src/diff/word.js  | 66 ++++++++++++++++++++++++-----------------------
 test/diff/word.js |  1 +
 2 files changed, 35 insertions(+), 32 deletions(-)

diff --git a/src/diff/word.js b/src/diff/word.js
index 6d8741de7..deee74b90 100644
--- a/src/diff/word.js
+++ b/src/diff/word.js
@@ -1,25 +1,19 @@
 import Diff from './base';
 import {generateOptions} from '../util/params';
 
-// Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode
-//
-// Ranges and exceptions:
-// Latin-1 Supplement, 0080–00FF
-//  - U+00D7  × Multiplication sign
-//  - U+00F7  ÷ Division sign
-// Latin Extended-A, 0100–017F
-// Latin Extended-B, 0180–024F
-// IPA Extensions, 0250–02AF
-// Spacing Modifier Letters, 02B0–02FF
-//  - U+02C7  ˇ &#711;  Caron
-//  - U+02D8  ˘ &#728;  Breve
-//  - U+02D9  ˙ &#729;  Dot Above
-//  - U+02DA  ˚ &#730;  Ring Above
-//  - U+02DB  ˛ &#731;  Ogonek
-//  - U+02DC  ˜ &#732;  Small Tilde
-//  - U+02DD  ˝ &#733;  Double Acute Accent
-// Latin Extended Additional, 1E00–1EFF
-const extendedWordChars = /^[a-zA-Z\u{C0}-\u{FF}\u{D8}-\u{F6}\u{F8}-\u{2C6}\u{2C8}-\u{2D7}\u{2DE}-\u{2FF}\u{1E00}-\u{1EFF}]+$/u;
+// Regex character-class bodies. Whitespace runs merge into one token; each "cannot become word" char is its own token.
+const spaceChars = ' \f\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff';
+const charsCannotBecomeWord = [
+  '\u0000-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E', // Basic Latin (covers \n and \r)
+  '\u00A0-\u00BF\u00D7\u00F7', // Latin-1 Supplement
+  '\u02B9-\u02DD\u02E5-\u02FF', // Spacing Modifier Letters
+  '\u0300-\u036F', // Combining Diacritical Marks
+  '\u{1F000}-\u{1FAFF}', // Mahjong Tiles - Symbols and Pictographs Extended-A (astral: needs the `u` flag)
+  '\u2000-\u2BFF', // General Punctuation - Miscellaneous Symbols and Arrows
+  '\u3000-\u303F' // CJK Symbols and Punctuation
+].join('');
+const spaceRegExp = new RegExp(`[${spaceChars}]`);
+const cannotBecomeWordRegExp = new RegExp(`[${charsCannotBecomeWord}]`, 'u');
 
 const reWhitespace = /\S/;
 
@@ -32,21 +26,29 @@ wordDiff.equals = function(left, right) {
   return left === right || (this.options.ignoreWhitespace && !reWhitespace.test(left) && !reWhitespace.test(right));
 };
 wordDiff.tokenize = function(value) {
-  // All whitespace symbols except newline group into one token, each newline - in separate token
-  let tokens = value.split(/([^\S\r\n]+|[()[\]{}'"\r\n]|\b)/);
-
-  // Join the boundary splits that we do not consider to be boundaries. This is primarily the extended Latin character set.
-  for (let i = 0; i < tokens.length - 1; i++) {
-    // If we have an empty string in the next field and we have only word chars before and after, merge
-    if (!tokens[i + 1] && tokens[i + 2]
-          && extendedWordChars.test(tokens[i])
-          && extendedWordChars.test(tokens[i + 2])) {
-      tokens[i] += tokens[i + 2];
-      tokens.splice(i + 1, 2);
-      i--;
+  const tokens = [];
+  let prevCharType = '';
+  // Walk code points (never half a surrogate pair); whitespace is kept verbatim so the tokens re-join to the input.
+  for (const char of value) {
+    if (spaceRegExp.test(char)) {
+      if (prevCharType === 'space') {
+        tokens[tokens.length - 1] += char;
+      } else {
+        tokens.push(char);
+      }
+      prevCharType = 'space';
+    } else if (cannotBecomeWordRegExp.test(char)) {
+      tokens.push(char);
+      prevCharType = '';
+    } else {
+      if (prevCharType === 'word') {
+        tokens[tokens.length - 1] += char;
+      } else {
+        tokens.push(char);
+      }
+      prevCharType = 'word';
     }
   }
-
   return tokens;
 };
 
diff --git a/test/diff/word.js b/test/diff/word.js
index b1814d482..f7a96ddc1 100644
--- a/test/diff/word.js
+++ b/test/diff/word.js
@@ -64,6 +64,7 @@ describe('WordDiff', function() {
     it('should token unicode characters safely', function() {
       expect(wordDiff.removeEmpty(wordDiff.tokenize('jurídica'))).to.eql(['jurídica']);
       expect(wordDiff.removeEmpty(wordDiff.tokenize('wir üben'))).to.eql(['wir', ' ', 'üben']);
+      expect(wordDiff.removeEmpty(wordDiff.tokenize('안녕.'))).to.eql(['안녕', '.']);
     });
 
     it('should include count with identity cases', function() {