// Word tokenizer for src/diff/word.js, from the patch
// "Improve word tokenization for non-Latin characters".
// `wordDiff` itself is created in a part of the module that the patch does not
// show; only the tokenizer and the character classes it relies on live here.

// Horizontal whitespace. Consecutive characters from this set are grouped into
// a single token. Line breaks are deliberately absent: they are listed in
// `charsCannotBecomeWord`, so every '\n' / '\r' becomes a token of its own.
const spaceChars = String.raw` \f\t\v\u00a0\u1680\u2000-\u200a\u2028\u2029\u202f\u205f\u3000\ufeff`;

// Characters that can never be part of a word. Each one is emitted as its own
// single-character token. The fragments are kept as raw text so that the
// RegExp parser (not the string literal) interprets the escapes; this is what
// allows the astral range below to be written with \u{...}.
//
// NOTE(review): Combining Diacritical Marks are classified as stand-alone
// symbols, so a decomposed letter ('e' followed by U+0301) is split away from
// its word. Precomposed letters (U+00E9) are unaffected. Confirm this is intended.
const charsCannotBecomeWord = [
  String.raw`\n\r`,
  String.raw`\u0000-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E`, // Basic Latin
  String.raw`\u00A0-\u00BF\u00D7\u00F7`, // Latin-1 Supplement
  String.raw`\u02B9-\u02DD\u02E5-\u02FF`, // Spacing Modifier Letters
  String.raw`\u0300-\u036F`, // Combining Diacritical Marks
  String.raw`\u{1F000}-\u{1FAFF}`, // Mahjong Tiles - Symbols and Pictographs Extended-A
  String.raw`\u2000-\u2BFF`, // General Punctuation - Miscellaneous Symbols and Arrows
  String.raw`\u3000-\u303F` // CJK Symbols and Punctuation
].join('');

// The `u` flag is required for the \u{1F000}-\u{1FAFF} range. Without it the
// pattern cannot address code points above U+FFFF. Neither expression is
// global/sticky, so `.test()` keeps no state between calls.
const spaceRegExp = new RegExp(`[${spaceChars}]`, 'u');
const cannotBecomeWordRegExp = new RegExp(`[${charsCannotBecomeWord}]`, 'u');

/**
 * Splits text into word, whitespace and symbol tokens.
 *
 * - A run of word characters (anything that is neither whitespace nor listed
 *   in `charsCannotBecomeWord`) forms one token.
 * - A run of horizontal whitespace forms one token and keeps its original
 *   characters, so `tokens.join('')` always reproduces `value`.
 * - Every other character (punctuation, symbols, '\n', '\r') is a token of
 *   its own and is never merged with its neighbours.
 *
 * The input is walked by code point, so a character outside the Basic
 * Multilingual Plane is classified as a whole instead of as two surrogates.
 *
 * @example
 * wordDiff.tokenize('wir üben'); // ['wir', ' ', 'üben']
 * wordDiff.tokenize('안녕.');     // ['안녕', '.']
 *
 * @param {string} value Text to tokenize.
 * @returns {string[]} Tokens in input order; empty array for an empty string.
 */
wordDiff.tokenize = function(value) {
  const tokens = [];
  let prevCharType = '';

  // for...of iterates a string by code point rather than by UTF-16 code unit.
  for (const char of value) {
    let charType;
    if (spaceRegExp.test(char)) {
      // Checked first: several spaces (U+00A0, U+2000-U+200A, U+3000, ...)
      // also fall inside the symbol ranges.
      charType = 'space';
    } else if (cannotBecomeWordRegExp.test(char)) {
      charType = 'symbol';
    } else {
      charType = 'word';
    }

    // Only words and whitespace accumulate; symbols always start a new token.
    if (charType !== 'symbol' && charType === prevCharType) {
      tokens[tokens.length - 1] += char;
    } else {
      tokens.push(char);
    }
    prevCharType = charType;
  }

  return tokens;
};