Add Intl.Segmenter support (#539)

ExplodingCabbage · web-flow · commit 4f0430a30789 · 2024-08-01T13:30:29.000+01:00
* Add Intl.Segmenter support and some initial tests. (Missing docs, coverage, release notes.)

* Get to 100% coverage

* Document intlSegmenter

* Improve docs

* Add release notes
diff --git a/README.md b/README.md
@@ -39,6 +39,11 @@ Broadly, jsdiff's diff functions all take an old text and a new text and perform
 
     Options
     * `ignoreCase`: Same as in `diffChars`. Defaults to false.
+    * `intlSegmenter`: An optional [`Intl.Segmenter`](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter) object (which must have a `granularity` of `'word'`) for `diffWords` to use to split the text into words.
+
+      By default, `diffWords` does not use an `Intl.Segmenter`, just some regexes for splitting text into words. This will tend to give worse results than `Intl.Segmenter` would, but ensures the results are consistent across environments; `Intl.Segmenter` behaviour is only loosely specified and the implementations in browsers could in principle change dramatically in future. If you want to use `diffWords` with an `Intl.Segmenter` but ensure it behaves the same whatever environment you run it in, use an `Intl.Segmenter` polyfill instead of the JavaScript engine's native `Intl.Segmenter` implementation.
+
+      Using an `Intl.Segmenter` should allow better word-level diffing of non-English text than the default behaviour. For instance, `Intl.Segmenter`s can generally identify via built-in dictionaries which sequences of adjacent Chinese characters form words, allowing word-level diffing of Chinese. By specifying a language when instantiating the segmenter (e.g. `new Intl.Segmenter('sv', {granularity: 'word'})`) you can also support language-specific rules, like treating Swedish's colon-separated contractions (like *k:a* for *kyrka*) as single words; by default each of these would be seen as two words separated by a colon.
 
 * `Diff.diffWordsWithSpace(oldStr, newStr[, options])` - diffs two blocks of text, treating each word, punctuation mark, newline, or run of (non-newline) whitespace as a token.
 
diff --git a/release-notes.md b/release-notes.md
@@ -34,6 +34,7 @@
   * The context line immediately before and immediately after an insertion must match exactly between the hunk and the file for a hunk to apply. (Previously this was not required.)
 - [#535](https://github.com/kpdecker/jsdiff/pull/535) **A bug in patch generation functions is now fixed** that would sometimes previously cause `\ No newline at end of file` to appear in the wrong place in the generated patch, resulting in the patch being invalid.
 - [#535](https://github.com/kpdecker/jsdiff/pull/535) **Passing `newlineIsToken: true` to *patch*-generation functions is no longer allowed.** (Passing it to `diffLines` is still supported - it's only functions like `createPatch` where passing `newlineIsToken` is now an error.) Allowing it to be passed never really made sense, since in cases where the option had any effect on the output at all, the effect tended to be causing a garbled patch to be created that couldn't actually be applied to the source file.
+- [#539](https://github.com/kpdecker/jsdiff/pull/539) **`diffWords` now takes an optional `intlSegmenter` option** which should be an `Intl.Segmenter` with word-level granularity. This provides better tokenization of text into words than the default behaviour, even for English but especially for some other languages for which the default behaviour is poor.
 
 ## v5.2.0
 
diff --git a/src/diff/word.js b/src/diff/word.js
@@ -58,8 +58,16 @@ wordDiff.equals = function(left, right, options) {
   return left.trim() === right.trim();
 };
 
-wordDiff.tokenize = function(value) {
-  let parts = value.match(tokenizeIncludingWhitespace) || [];
+wordDiff.tokenize = function(value, options = {}) {
+  let parts;
+  if (options.intlSegmenter) {
+    if (options.intlSegmenter.resolvedOptions().granularity != 'word') {
+      throw new Error('The segmenter passed must have a granularity of "word"');
+    }
+    parts = Array.from(options.intlSegmenter.segment(value), segment => segment.segment);
+  } else {
+    parts = value.match(tokenizeIncludingWhitespace) || [];
+  }
   const tokens = [];
   let prevPart = null;
   parts.forEach(part => {
diff --git a/test/diff/word.js b/test/diff/word.js
@@ -209,6 +209,55 @@ describe('WordDiff', function() {
       );
       expect(convertChangesToXML(diffResult)).to.equal('foo<del> </del><ins>\t</ins>bar');
     });
+
+    it('supports tokenizing with an Intl.Segmenter', () => {
+      // Example 1: Diffing Chinese text with no spaces.
+      // I am not a Chinese speaker but I believe these sentences to mean:
+      // 1. "I have (我有) many (很多) tables (桌子)"
+      // 2. "Mei (梅) has (有) many (很多) sons (儿子)"
+      // We want to see that diffWords will get the word counts right and won't try to treat the
+      // trailing 子 as common to both texts (since it's part of a different word each time).
+      // TODO: Check with a Chinese speaker that this example is correct Chinese.
+      const chineseSegmenter = new Intl.Segmenter('zh', {granularity: 'word'});
+      const diffResult = diffWords('我有很多桌子。', '梅有很多儿子。', {intlSegmenter: chineseSegmenter});
+      expect(diffResult).to.deep.equal([
+        { count: 1, added: false, removed: true, value: '我有' },
+        { count: 2, added: true, removed: false, value: '梅有' },
+        { count: 1, added: false, removed: false, value: '很多' },
+        { count: 1, added: false, removed: true, value: '桌子' },
+        { count: 1, added: true, removed: false, value: '儿子' },
+        { count: 1, added: false, removed: false, value: '。' }
+      ]);
+
+      // Example 2: Should understand that a colon in the middle of a word is not a word break in
+      // Finnish (see https://stackoverflow.com/a/76402021/1709587)
+      const finnishSegmenter = new Intl.Segmenter('fi', {granularity: 'word'});
+      expect(convertChangesToXML(diffWords(
+        'USA:n nykyinen presidentti',
+        'USA ja sen presidentti',
+        {intlSegmenter: finnishSegmenter}
+      ))).to.equal('<del>USA:n nykyinen</del><ins>USA ja sen</ins> presidentti');
+
+      // Example 3: Some English text, including contractions, long runs of arbitrary space,
+      // and punctuation, and using case insensitive mode, just to show all normal behaviour of
+      // diffWords still works with a segmenter
+      const englishSegmenter = new Intl.Segmenter('en', {granularity: 'word'});
+      expect(convertChangesToXML(diffWords(
+        "There wasn't time \n \t   for all that. He thought...",
+        "There isn't time \n \t   left for all that, he thinks.",
+        {intlSegmenter: englishSegmenter, ignoreCase: true}
+      ))).to.equal(
+        "There <del>wasn't</del><ins>isn't</ins> time \n \t   <ins>left </ins>"
+        + 'for all that<del>.</del><ins>,</ins> he <del>thought</del><ins>thinks</ins>.<del>..</del>'
+      );
+    });
+
+    it('rejects attempts to use a non-word Intl.Segmenter', () => {
+      const segmenter = new Intl.Segmenter('en', {granularity: 'grapheme'});
+      expect(() => {
+        diffWords('foo', 'bar', {intlSegmenter: segmenter});
+      }).to['throw']('The segmenter passed must have a granularity of "word"');
+    });
   });
 
   describe('#diffWordsWithSpace', function() {