Skip to content

Commit 65ffd8d

Browse files
committed
Automatically detect UTF8 character encoding in output
1 parent 16dfb3f commit 65ffd8d

File tree

7 files changed

+270
-147
lines changed

7 files changed

+270
-147
lines changed

Diff for: src/core/lib/ChrEnc.mjs

+79-2
Original file line numberDiff line numberDiff line change
@@ -224,8 +224,85 @@ export function chrEncWidth(page) {
224224
* @copyright Crown Copyright 2019
225225
* @license Apache-2.0
226226
*/
227+
export const UNICODE_NORMALISATION_FORMS = ["NFD", "NFC", "NFKD", "NFKC"];
228+
227229

228230
/**
229-
* Character encoding format mappings.
231+
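* Detects whether the input buffer looks like UTF8 text: every sequence must be well-formed UTF8, and the only single-byte values accepted are tab, line feed, carriage return and 0x20-0x7E.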
232+
*
233+
* @param {ArrayBuffer} data
234+
* @returns {number} - 0 = not UTF8, 1 = ASCII, 2 = UTF8
230235
*/
231-
export const UNICODE_NORMALISATION_FORMS = ["NFD", "NFC", "NFKD", "NFKC"];
236+
export function isUTF8(data) {
    const bytes = new Uint8Array(data);
    let onlyASCII = true;
    let i = 0;

    while (i < bytes.length) {
        const byte = bytes[i];

        // Single-byte characters: only tab, line feed, carriage return and
        // 0x20-0x7E are accepted. Any other byte below 0x80 (e.g. 0x00 or
        // 0x7F) falls through to the multi-byte check and is rejected there.
        if (byte === 0x09 || byte === 0x0A || byte === 0x0D || (0x20 <= byte && byte <= 0x7E)) {
            i += 1;
            continue;
        }

        onlyASCII = false;

        const seqLength = utf8MultiByteLength(bytes, i);
        if (seqLength === 0) return 0;
        i += seqLength;
    }

    // An empty buffer never enters the loop, so it is reported as ASCII (1).
    return onlyASCII ? 1 : 2;
}

/**
 * Measures the well-formed multi-byte UTF8 sequence beginning at a given offset.
 *
 * Overlong encodings, UTF-16 surrogates (U+D800-U+DFFF), code points above
 * U+10FFFF and sequences truncated by the end of the data are all rejected.
 *
 * @param {Uint8Array} bytes
 * @param {number} start - Index of the lead byte
 * @returns {number} - Length of the sequence (2, 3 or 4), or 0 if it is not well-formed
 */
function utf8MultiByteLength(bytes, start) {
    const lead = bytes[start];

    // Every byte after the lead must be in 0x80-0xBF. For some lead bytes the
    // first of them is restricted further, which is what excludes overlongs,
    // surrogates and values beyond plane 16.
    let length = 0;
    let secondMin = 0x80;
    let secondMax = 0xBF;

    if (0xC2 <= lead && lead <= 0xDF) { // non-overlong 2-byte
        length = 2;
    } else if (lead === 0xE0) { // 3-byte, excluding overlongs
        length = 3;
        secondMin = 0xA0;
    } else if (lead === 0xED) { // 3-byte, excluding surrogates
        length = 3;
        secondMax = 0x9F;
    } else if (0xE1 <= lead && lead <= 0xEF) { // straight 3-byte (0xED handled above)
        length = 3;
    } else if (lead === 0xF0) { // planes 1-3
        length = 4;
        secondMin = 0x90;
    } else if (0xF1 <= lead && lead <= 0xF3) { // planes 4-15
        length = 4;
    } else if (lead === 0xF4) { // plane 16
        length = 4;
        secondMax = 0x8F;
    } else {
        // Stray continuation byte, 0xC0/0xC1, 0xF5-0xFF or a disallowed control byte
        return 0;
    }

    // Sequence runs past the end of the data
    if (start + length > bytes.length) return 0;

    const second = bytes[start + 1];
    if (second < secondMin || second > secondMax) return 0;

    for (let k = start + 2; k < start + length; k++) {
        if (bytes[k] < 0x80 || bytes[k] > 0xBF) return 0;
    }

    return length;
}

Diff for: src/core/lib/Magic.mjs

+2-77
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import Utils, { isWorkerEnvironment } from "../Utils.mjs";
33
import Recipe from "../Recipe.mjs";
44
import Dish from "../Dish.mjs";
55
import {detectFileType, isType} from "./FileType.mjs";
6+
import {isUTF8} from "./ChrEnc.mjs";
67
import chiSquared from "chi-squared";
78

89
/**
@@ -111,82 +112,6 @@ class Magic {
111112
};
112113
}
113114

114-
/**
115-
* Detects whether the input buffer is valid UTF8.
116-
*
117-
* @returns {boolean}
118-
*/
119-
isUTF8() {
120-
const bytes = new Uint8Array(this.inputBuffer);
121-
let i = 0;
122-
while (i < bytes.length) {
123-
if (( // ASCII
124-
bytes[i] === 0x09 ||
125-
bytes[i] === 0x0A ||
126-
bytes[i] === 0x0D ||
127-
(0x20 <= bytes[i] && bytes[i] <= 0x7E)
128-
)) {
129-
i += 1;
130-
continue;
131-
}
132-
133-
if (( // non-overlong 2-byte
134-
(0xC2 <= bytes[i] && bytes[i] <= 0xDF) &&
135-
(0x80 <= bytes[i+1] && bytes[i+1] <= 0xBF)
136-
)) {
137-
i += 2;
138-
continue;
139-
}
140-
141-
if (( // excluding overlongs
142-
bytes[i] === 0xE0 &&
143-
(0xA0 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
144-
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF)
145-
) ||
146-
( // straight 3-byte
147-
((0xE1 <= bytes[i] && bytes[i] <= 0xEC) ||
148-
bytes[i] === 0xEE ||
149-
bytes[i] === 0xEF) &&
150-
(0x80 <= bytes[i + 1] && bytes[i+1] <= 0xBF) &&
151-
(0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
152-
) ||
153-
( // excluding surrogates
154-
bytes[i] === 0xED &&
155-
(0x80 <= bytes[i+1] && bytes[i+1] <= 0x9F) &&
156-
(0x80 <= bytes[i+2] && bytes[i+2] <= 0xBF)
157-
)) {
158-
i += 3;
159-
continue;
160-
}
161-
162-
if (( // planes 1-3
163-
bytes[i] === 0xF0 &&
164-
(0x90 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
165-
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
166-
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
167-
) ||
168-
( // planes 4-15
169-
(0xF1 <= bytes[i] && bytes[i] <= 0xF3) &&
170-
(0x80 <= bytes[i + 1] && bytes[i + 1] <= 0xBF) &&
171-
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
172-
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
173-
) ||
174-
( // plane 16
175-
bytes[i] === 0xF4 &&
176-
(0x80 <= bytes[i + 1] && bytes[i + 1] <= 0x8F) &&
177-
(0x80 <= bytes[i + 2] && bytes[i + 2] <= 0xBF) &&
178-
(0x80 <= bytes[i + 3] && bytes[i + 3] <= 0xBF)
179-
)) {
180-
i += 4;
181-
continue;
182-
}
183-
184-
return false;
185-
}
186-
187-
return true;
188-
}
189-
190115
/**
191116
* Calculates the Shannon entropy of the input data.
192117
*
@@ -336,7 +261,7 @@ class Magic {
336261
data: this.inputStr.slice(0, 100),
337262
languageScores: this.detectLanguage(extLang),
338263
fileType: this.detectFileType(),
339-
isUTF8: this.isUTF8(),
264+
isUTF8: !!isUTF8(this.inputBuffer),
340265
entropy: this.calcEntropy(),
341266
matchingOps: matchingOps,
342267
useful: useful,

Diff for: src/web/App.mjs

+4-4
Original file line numberDiff line numberDiff line change
@@ -500,22 +500,22 @@ class App {
500500
// Input Character Encoding
501501
// Must be set before the input is loaded
502502
if (this.uriParams.ienc) {
503-
this.manager.input.chrEncChange(parseInt(this.uriParams.ienc, 10));
503+
this.manager.input.chrEncChange(parseInt(this.uriParams.ienc, 10), true);
504504
}
505505

506506
// Output Character Encoding
507507
if (this.uriParams.oenc) {
508-
this.manager.output.chrEncChange(parseInt(this.uriParams.oenc, 10));
508+
this.manager.output.chrEncChange(parseInt(this.uriParams.oenc, 10), true);
509509
}
510510

511511
// Input EOL sequence
512512
if (this.uriParams.ieol) {
513-
this.manager.input.eolChange(this.uriParams.ieol);
513+
this.manager.input.eolChange(this.uriParams.ieol, true);
514514
}
515515

516516
// Output EOL sequence
517517
if (this.uriParams.oeol) {
518-
this.manager.output.eolChange(this.uriParams.oeol);
518+
this.manager.output.eolChange(this.uriParams.oeol, true);
519519
}
520520

521521
// Read in input data from URI params

Diff for: src/web/stylesheets/components/_operation.css

+5-1
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ select.arg {
6969
min-width: 100px;
7070
}
7171

72+
select.arg.form-control:not([size]):not([multiple]), select.custom-file-control:not([size]):not([multiple]) {
73+
height: 100% !important;
74+
}
75+
7276
textarea.arg {
7377
min-height: 74px;
7478
resize: vertical;
@@ -80,7 +84,7 @@ div.toggle-string {
8084

8185
input.toggle-string {
8286
border-top-right-radius: 0 !important;
83-
height: 42px !important;
87+
height: 100%;
8488
}
8589

8690
.operation [class^='bmd-label'],

Diff for: src/web/utils/statusBar.mjs

+45-6
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ class StatusBarPanel {
2424
this.eolHandler = opts.eolHandler;
2525
this.chrEncHandler = opts.chrEncHandler;
2626
this.chrEncGetter = opts.chrEncGetter;
27+
this.getEncodingState = opts.getEncodingState;
28+
this.getEOLState = opts.getEOLState;
2729
this.htmlOutput = opts.htmlOutput;
2830

2931
this.eolVal = null;
@@ -115,7 +117,7 @@ class StatusBarPanel {
115117

116118
if (isNaN(chrEncVal)) return;
117119

118-
this.chrEncHandler(chrEncVal);
120+
this.chrEncHandler(chrEncVal, true);
119121
this.updateCharEnc(chrEncVal);
120122
hideElement(e.target.closest(".cm-status-bar-select-content"));
121123
}
@@ -212,12 +214,31 @@ class StatusBarPanel {
212214
* @param {EditorState} state
213215
*/
214216
updateEOL(state) {
215-
if (state.lineBreak === this.eolVal) return;
217+
if (this.getEOLState() < 2 && state.lineBreak === this.eolVal) return;
216218

217219
const val = this.dom.querySelector(".eol-value");
218220
const button = val.closest(".cm-status-bar-select-btn");
219-
const eolCode = eolSeqToCode[state.lineBreak];
220-
const eolName = eolCodeToName[eolCode];
221+
let eolCode = eolSeqToCode[state.lineBreak];
222+
let eolName = eolCodeToName[eolCode];
223+
224+
switch (this.getEOLState()) {
225+
case 1: // Detected
226+
val.classList.add("font-italic");
227+
eolCode += " (detected)";
228+
eolName += " (detected)";
229+
// Pulse
230+
val.classList.add("pulse");
231+
setTimeout(() => {
232+
val.classList.remove("pulse");
233+
}, 2000);
234+
break;
235+
case 0: // Unset
236+
case 2: // Manually set
237+
default:
238+
val.classList.remove("font-italic");
239+
break;
240+
}
241+
221242
val.textContent = eolCode;
222243
button.setAttribute("title", `End of line sequence:<br>${eolName}`);
223244
button.setAttribute("data-original-title", `End of line sequence:<br>${eolName}`);
@@ -230,12 +251,30 @@ class StatusBarPanel {
230251
*/
231252
updateCharEnc() {
232253
const chrEncVal = this.chrEncGetter();
233-
if (chrEncVal === this.chrEncVal) return;
254+
if (this.getEncodingState() < 2 && chrEncVal === this.chrEncVal) return;
234255

235-
const name = CHR_ENC_SIMPLE_REVERSE_LOOKUP[chrEncVal] ? CHR_ENC_SIMPLE_REVERSE_LOOKUP[chrEncVal] : "Raw Bytes";
256+
let name = CHR_ENC_SIMPLE_REVERSE_LOOKUP[chrEncVal] ? CHR_ENC_SIMPLE_REVERSE_LOOKUP[chrEncVal] : "Raw Bytes";
236257

237258
const val = this.dom.querySelector(".chr-enc-value");
238259
const button = val.closest(".cm-status-bar-select-btn");
260+
261+
switch (this.getEncodingState()) {
262+
case 1: // Detected
263+
val.classList.add("font-italic");
264+
name += " (detected)";
265+
// Pulse
266+
val.classList.add("pulse");
267+
setTimeout(() => {
268+
val.classList.remove("pulse");
269+
}, 2000);
270+
break;
271+
case 0: // Unset
272+
case 2: // Manually set
273+
default:
274+
val.classList.remove("font-italic");
275+
break;
276+
}
277+
239278
val.textContent = name;
240279
button.setAttribute("title", `${this.label} character encoding:<br>${name}`);
241280
button.setAttribute("data-original-title", `${this.label} character encoding:<br>${name}`);

0 commit comments

Comments
 (0)