Skip to content

Commit af59f34

Browse files
wooorm, Flet, and jablko
authored
Fix to match GitHub’s algorithm on unicode
I reverse engineered GitHub’s slugging algorithm, somewhat based on #25 and #35. To do that, I created two scripts:

* `generate-fixtures.mjs`, which generates a markdown file (in part from manual fixtures and in part from the Unicode General Categories), creates a gist, crawls the gist, removes it, and saves fixtures annotated with the expected result from GitHub
* `generate-regex.mjs`, which generates the regex that GitHub uses for characters to ignore

The regex is about 2.5kb minzipped. This increases the file size of this project a bit, but matching GitHub is worth it in my opinion.

I also investigated regex `\p{}` classes in `/u` regexes. They work mostly fine, with two caveats: a) they don’t work everywhere, so using them would require a major release, and b) GitHub does not implement the same Unicode version as browsers. I tested with Unicode 13 and 14, and they include characters that GitHub handles differently.

In the end, GitHub’s algorithm is mostly fine: strip non-alphanumeric characters, allow `-`, and turn ` ` (space) into `-`.

Finally, I removed the trim functionality, because it is not implemented by GitHub. To verify this, make a heading like so in a readme: `# &#x20;`. This is a space encoded as a character reference, meaning that markdown does not see it as the whitespace between the `#` and the content. In fact, this makes it the content, and GitHub creates a slug of `-` for it.

Closes GH-22.
Closes GH-25.
Closes GH-35.
Closes GH-38.

Co-authored-by: Dan Flettre <[email protected]>
Co-authored-by: Jack Bates <[email protected]>
1 parent 156591b commit af59f34

17 files changed

+627
-290
lines changed

index.js

+2-8
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
1-
const emoji = require('emoji-regex')
1+
const regex = require('./regex.js')
22

33
module.exports = BananaSlug
44

55
const own = Object.hasOwnProperty
6-
const whitespace = /\s/g
7-
const specials = /[\u2000-\u206F\u2E00-\u2E7F\\'!"#$%&()*+,./:;<=>?@[\]^`{|}~]/g
86

97
function BananaSlug () {
108
const self = this
@@ -46,11 +44,7 @@ BananaSlug.prototype.reset = function () {
4644
/**
 * Turn `string` into a slug.
 *
 * NOTE: this is a diff hunk. Lines that follow an `NN-` marker belong to the
 * removed implementation; the line that follows the `47+` marker replaces them.
 *
 * @param {string} string - Text to slug; any non-string value yields `''`.
 * @param {boolean} [maintainCase] - When falsy, the input is lowercased first.
 * @returns {string} The slug.
 */
function slugger (string, maintainCase) {
4745
if (typeof string !== 'string') return ''
4846
if (!maintainCase) string = string.toLowerCase()
49-
50-
// Removed: trimmed the input, stripped `specials` and emoji, then turned every whitespace character into `-`.
return string.trim()
51-
.replace(specials, '')
52-
.replace(emoji(), '')
53-
.replace(whitespace, '-')
47+
// Added: strip everything matched by the generated `regex`, then turn each space into `-` (no trimming).
return string.replace(regex, '').replace(/ /g, '-')
5448
}
5549

5650
BananaSlug.slug = slugger

package.json

+12-5
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,23 @@
1111
"url": "https://github.com/Flet/github-slugger/issues"
1212
},
1313
"files": [
14-
"index.js"
14+
"index.js",
15+
"regex.js"
1516
],
16-
"dependencies": {
17-
"emoji-regex": ">=6.0.0 <=6.1.1"
18-
},
1917
"devDependencies": {
18+
"@octokit/rest": "^18.0.0",
19+
"@unicode/unicode-12.1.0": "^1.0.0",
20+
"hast-util-select": "^5.0.0",
21+
"mdast-util-gfm": "^1.0.0",
22+
"mdast-util-to-markdown": "^1.0.0",
23+
"node-fetch": "^2.0.0",
2024
"nyc": "^15.0.0",
25+
"regenerate": "^1.0.0",
26+
"rehype-parse": "^8.0.0",
2127
"standard": "*",
2228
"tap-spec": "^5.0.0",
23-
"tape": "^4.0.0"
29+
"tape": "^4.0.0",
30+
"unified": "^10.0.0"
2431
},
2532
"homepage": "https://github.com/Flet/github-slugger",
2633
"keywords": [

regex.js

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

script/generate-fixtures.mjs

+145
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import { promises as fs } from 'node:fs'
2+
import { Octokit } from '@octokit/rest'
3+
import fetch from 'node-fetch'
4+
import { unified } from 'unified'
5+
import rehypeParse from 'rehype-parse'
6+
import { select, selectAll } from 'hast-util-select'
7+
import { toMarkdown } from 'mdast-util-to-markdown'
8+
import { gfmToMarkdown } from 'mdast-util-gfm'
9+
10+
// Note: the GH token needs `gists` access!
// Either `GH_TOKEN` or `GITHUB_TOKEN` is accepted; `GH_TOKEN` wins when both are set.
const ghToken = process.env.GH_TOKEN || process.env.GITHUB_TOKEN

if (!ghToken) {
  throw new Error('Missing GitHub token: expected `GH_TOKEN` or `GITHUB_TOKEN` in env')
}

// GitHub API client used to create and delete the temporary gist.
const octo = new Octokit({ auth: 'token ' + ghToken })

// Directory (resolved relative to this script) that holds one folder per Unicode General Category.
const categoryBase = new URL('../node_modules/@unicode/unicode-12.1.0/General_Category/', import.meta.url)

// Take up to N samples from each category.
const samples = 400
22+
23+
// Hand-written fixtures that are sent to GitHub ahead of the generated
// per-category ones.
// Each entry has a `name` and the `input` to slug; an optional
// `markdownOverwrite` is used as the raw markdown for the heading instead of
// serializing `input` (needed for headings that consist of, or start/end with,
// spaces).
const otherTests = [
  { name: 'Basic usage', input: 'alpha' },
  { name: 'Basic usage (again)', input: 'alpha' },
  { name: 'Camelcase', input: 'bravoCharlieDelta' },
  { name: 'Prototypal injection: proto', input: '__proto__' },
  { name: 'Prototypal injection: proto (again)', input: '__proto__' },
  { name: 'Prototypal injection: has own', input: 'hasOwnProperty' },
  { name: 'Repetition (1)', input: 'echo' },
  { name: 'Repetition (2)', input: 'echo' },
  { name: 'Repetition (3)', input: 'echo 1' },
  { name: 'Repetition (4)', input: 'echo-1' },
  { name: 'Repetition (5)', input: 'echo' },
  { name: 'More repetition (1)', input: 'foxtrot-1' },
  { name: 'More repetition (2)', input: 'foxtrot' },
  { name: 'More repetition (3)', input: 'foxtrot' },
  { name: 'Characters: dash', input: 'heading with a - dash' },
  { name: 'Characters: underscore', input: 'heading with an _ underscore' },
  { name: 'Characters: dot', input: 'heading with a period.txt' },
  { name: 'Characters: dots, parents, brackets', input: 'exchange.bind_headers(exchange, routing [, bindCallback])' },
  { name: 'Characters: space', input: ' ', markdownOverwrite: '# &#x20;' },
  { name: 'Characters: initial space', input: ' a', markdownOverwrite: '# &#x20;a' },
  { name: 'Characters: final space', input: 'a ', markdownOverwrite: '# a&#x20;' },
  { name: 'Characters: initial and final spaces', input: ' a ', markdownOverwrite: '# &#x20;a&#x20;' },
  { name: 'Characters: initial and final dashes', input: '-a-' },
  { name: 'Characters: apostrophe', input: 'apostrophe’s should be trimmed' },
  { name: 'Some more duplicates (1)', input: 'golf' },
  { name: 'Some more duplicates (2)', input: 'golf' },
  { name: 'Some more duplicates (3)', input: 'golf' },
  { name: 'Non-ascii: ♥', input: 'I ♥ unicode' },
  { name: 'Non-ascii: -', input: 'dash-dash' },
  { name: 'Non-ascii: –', input: 'en–dash' },
  // This one must use a real em dash (U+2014), not a second en dash (U+2013).
  { name: 'Non-ascii: —', input: 'em—dash' },
  { name: 'Non-ascii: 😄', input: '😄 unicode emoji' },
  { name: 'Non-ascii: 😄-😄', input: '😄-😄 unicode emoji' },
  { name: 'Non-ascii: 😄_😄', input: '😄_😄 unicode emoji' },
  { name: 'Non-ascii: 😄', input: '😄 - an emoji' },
  { name: 'Non-ascii: :smile:', input: ':smile: - a gemoji' },
  { name: 'Non-ascii: Cyrillic (1)', input: 'Привет' },
  { name: 'Non-ascii: Cyrillic (2)', input: 'Профили пользователей' },
  { name: 'Non-ascii: Cyrillic + Han', input: 'Привет non-latin 你好' },
  { name: 'Gemoji (1)', input: ':ok: No underscore' },
  { name: 'Gemoji (2)', input: ':ok_hand: Single' },
  { name: 'Gemoji (3)', input: ':ok_hand::hatched_chick: Two in a row with no spaces' },
  { name: 'Gemoji (4)', input: ':ok_hand: :hatched_chick: Two in a row' }
]
68+
69+
main()
70+
71+
/**
 * Generate `test/fixtures.json`.
 *
 * Builds the list of test cases (the manual `otherTests` plus one sampled case
 * per Unicode General Category), publishes them as headings in a temporary
 * gist, reads the slugs GitHub generated from the rendered page, and writes
 * the cases annotated with `expected` to disk.
 *
 * @returns {Promise<void>}
 * @throws {Error} When GitHub treats the gist as binary, the rendered page
 *   cannot be fetched or parsed, or the number of anchors does not match the
 *   number of test cases.
 */
async function main () {
  const files = await fs.readdir(categoryBase)
  const tests = [...otherTests]

  // Create a test case with a bunch of examples for each category.
  for (const name of files) {
    if (name === 'index.js') continue

    // These result in Git(Hub) thinking it’s a binary file.
    if (name === 'Control' || name === 'Surrogate') continue

    // This prevents GH from rendering markdown to HTML.
    if (name === 'Other') continue

    const { default: codePoints } = await import(new URL(`./${name}/code-points.js`, categoryBase))

    // Evenly spaced samples plus the last code point; the set drops the
    // duplicates that occur when a category has fewer than `samples` members.
    const subs = new Set()

    for (let n = 0; n < samples; n++) {
      subs.add(codePoints[Math.floor(codePoints.length / samples * n)])
    }

    subs.add(codePoints[codePoints.length - 1])

    tests.push({ name, input: 'a' + [...subs].map(d => String.fromCodePoint(d)).join(' ') + 'b' })
  }

  // Create a Gist.
  const filename = 'readme.md'
  const gistResult = await octo.gists.create({
    files: {
      [filename]: {
        content: tests.map(d => {
          return d.markdownOverwrite || toMarkdown({ type: 'heading', depth: 1, children: [{ type: 'text', value: d.input }] }, { extensions: [gfmToMarkdown()] })
        }).join('\n\n')
      }
    }
  })

  let doc

  // Whatever happens while inspecting or fetching the gist, remove it again
  // so failed runs do not leave stray gists behind.
  try {
    const file = gistResult.data.files[filename]

    if (!file.language) {
      throw new Error('The generated markdown was seen as binary data instead of text by GitHub. This is likely because there are weird characters (such as control characters or lone surrogates) in it')
    }

    // Fetch the rendered page.
    const response = await fetch(gistResult.data.html_url, {
      headers: { Authorization: 'token ' + ghToken }
    })

    if (!response.ok) {
      throw new Error('Could not fetch the rendered gist: ' + response.status + ' ' + response.statusText)
    }

    doc = await response.text()
  } finally {
    // Remove the Gist.
    await octo.gists.delete({ gist_id: gistResult.data.id })
  }

  const tree = unified().use(rehypeParse).parse(doc)
  const markdownBody = select('.markdown-body', tree)

  if (!markdownBody) {
    throw new Error('The generated markdown could not be rendered by GitHub as HTML. This is likely because there are weird characters in it')
  }

  const anchors = selectAll('h1 .anchor', markdownBody)

  // Anchors are matched to tests by position, so a count mismatch would
  // silently attach slugs to the wrong test cases.
  if (anchors.length !== tests.length) {
    throw new Error('Expected ' + tests.length + ' heading anchors in the rendered gist, but found ' + anchors.length)
  }

  anchors.forEach((node, i) => {
    // Drop the leading `#` of the fragment link.
    tests[i].expected = node.properties.href.slice(1)
  })

  await fs.writeFile(new URL('../test/fixtures.json', import.meta.url), JSON.stringify(tests, null, 2) + '\n')
}

script/generate-regex.mjs

+62
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import { promises as fs } from 'node:fs'
2+
import regenerate from 'regenerate'
3+
import alphabetics from '@unicode/unicode-12.1.0/Binary_Property/Alphabetic/code-points.js'
4+
5+
// Directory (resolved relative to this script) that holds one folder per Unicode General Category.
const categoryBase = new URL('../node_modules/@unicode/unicode-12.1.0/General_Category/', import.meta.url)
6+
7+
// Unicode General Categories to remove.
// Each name is a folder under `categoryBase` whose `code-points.js` is loaded
// and added to the generated regex.
8+
const ranges = [
9+
// Some numbers:
10+
'Other_Number',
11+
12+
// Some punctuation:
13+
'Close_Punctuation',
14+
'Final_Punctuation',
15+
'Initial_Punctuation',
16+
'Open_Punctuation',
17+
'Other_Punctuation',
18+
// All except a normal `-` (dash), which `main` removes from the set again.
19+
'Dash_Punctuation',
20+
21+
// All:
22+
'Symbol',
23+
'Control',
24+
'Private_Use',
25+
'Format',
26+
'Unassigned',
27+
28+
// All except a normal ` ` (space), which `main` removes from the set again.
29+
'Separator'
30+
]
31+
32+
main()
33+
34+
/**
 * Generate `regex.js` in the project root.
 *
 * Collects the code points of every category listed in `ranges`, takes out
 * the ones that must survive slugging (alphabetic characters, the space, and
 * the dash), and writes the result as a CommonJS module exporting a global
 * regex.
 *
 * @returns {Promise<void>}
 */
async function main () {
  const generator = regenerate()

  // Add code points to strip.
  for (const name of ranges) {
    const { default: codePoints } = await import(new URL(`./${name}/code-points.js`, categoryBase))

    generator.add(codePoints)
  }

  generator
    // Some overlap between letters and Other Symbol.
    .remove(alphabetics)
    // Spaces are turned to `-`
    .remove(' ')
    // Dash is kept.
    .remove('-')

  // Resolve the output against this script, not the current working
  // directory, so the file lands in the project root wherever the script is
  // run from.
  await fs.writeFile(new URL('../regex.js', import.meta.url), [
    '// This module is generated by `script/`.',
    '/* eslint-disable no-control-regex, no-misleading-character-class, no-useless-escape */',
    // `toRegExp()` stringifies to `/…/`; the trailing `g` adds the global flag.
    'module.exports = ' + generator.toRegExp() + 'g',
    ''
  ].join('\n'))
}

test/1-basic-usage.md

-5
This file was deleted.

test/2-camel-case.md

-5
This file was deleted.

test/3-prototype.md

-7
This file was deleted.

test/4-matching-slugs-basic.md

-9
This file was deleted.

test/5-matching-slugs-again.md

-5
This file was deleted.

test/6-characters.md

-17
This file was deleted.

test/7-duplicates.md

-5
This file was deleted.

test/8-non-ascii.md

-23
This file was deleted.

test/9-emoji.md

-7
This file was deleted.

0 commit comments

Comments
 (0)