Skip to content

Commit d0bf895

Browse files
Reorder requirements file decoding
This changes the decoding process to be more in line with what was previously documented. The new process is outlined in the updated docs. The `auto_decode` function was removed and all decoding logic moved to the `pip._internal.req.req_file` module because: * This function was only ever used to decode requirements files. * It was never really a generic 'util' function; it was always tied to the idiosyncrasies of decoding requirements files. * The module lived under `_internal`, so I felt comfortable removing it. A warning was added when we _do_ fall back to using the locale-defined encoding, to encourage users to move to an explicit encoding definition via a PEP-263 style coding comment. This fixes two existing bugs. Firstly, when: * a requirements file is encoded as UTF-8, and * some bytes in the file are incompatible with the system locale. Previously, assuming no BOM or PEP-263 style comment, we would default to using the encoding from the system locale, which would then fail (see issue #12771). Secondly, when decoding a file starting with a UTF-32 little-endian Byte Order Mark. Previously this would always fail, since `codecs.BOM_UTF32_LE` is `codecs.BOM_UTF16_LE` followed by two null bytes, and because of the ordering of the list of BOMs the UTF-16 case would be run first and match the file prefix, so we would incorrectly deduce that the file was UTF-16 little-endian encoded. I can't imagine this is a popular encoding for a requirements file. Fixes: #12771
1 parent 6958e28 commit d0bf895

File tree

6 files changed

+179
-86
lines changed

6 files changed

+179
-86
lines changed

docs/html/reference/requirements-file-format.md

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,15 @@ examples of all these forms, see {ref}`pip install Examples`.
5656

5757
### Encoding
5858

59-
Requirements files are `utf-8` encoding by default and also support
60-
{pep}`263` style comments to change the encoding (i.e.
61-
`# -*- coding: <encoding name> -*-`).
59+
It is simplest to encode your requirements files with UTF-8.
60+
The process for decoding requirements files is:
61+
62+
- Check for any Byte Order Mark at the start of the file and if found use
63+
the corresponding encoding to decode the file.
64+
- Check for any {pep}`263` style comment (e.g. `# -*- coding: <encoding name> -*-`)
65+
and if found decode with the given encoding.
66+
- Try to decode with UTF-8, and if that fails,
67+
- fall back to decoding using the locale-defined encoding.
6268

6369
### Line continuations
6470

news/12771.feature.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Reorder the encoding detection when decoding a requirements file, relying on
2+
UTF-8 over the locale encoding by default.

src/pip/_internal/req/req_file.py

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,14 @@
22
Requirements file parsing
33
"""
44

5+
import codecs
6+
import locale
57
import logging
68
import optparse
79
import os
810
import re
911
import shlex
12+
import sys
1013
import urllib.parse
1114
from optparse import Values
1215
from typing import (
@@ -25,7 +28,6 @@
2528
from pip._internal.cli import cmdoptions
2629
from pip._internal.exceptions import InstallationError, RequirementsFileParseError
2730
from pip._internal.models.search_scope import SearchScope
28-
from pip._internal.utils.encoding import auto_decode
2931

3032
if TYPE_CHECKING:
3133
from pip._internal.index.package_finder import PackageFinder
@@ -563,7 +565,56 @@ def get_file_content(url: str, session: "PipSession") -> Tuple[str, str]:
563565
# Assume this is a bare path.
564566
try:
565567
with open(url, "rb") as f:
566-
content = auto_decode(f.read())
568+
raw_content = f.read()
567569
except OSError as exc:
568570
raise InstallationError(f"Could not open requirements file: {exc}")
571+
572+
content = _decode_req_file(raw_content, url)
573+
569574
return url, content
575+
576+
577+
BOMS: List[Tuple[bytes, str]] = [
578+
(codecs.BOM_UTF8, "utf-8"),
579+
(codecs.BOM_UTF32, "utf-32"),
580+
(codecs.BOM_UTF32_BE, "utf-32-be"),
581+
(codecs.BOM_UTF32_LE, "utf-32-le"),
582+
(codecs.BOM_UTF16, "utf-16"),
583+
(codecs.BOM_UTF16_BE, "utf-16-be"),
584+
(codecs.BOM_UTF16_LE, "utf-16-le"),
585+
]
586+
587+
ENCODING_RE = re.compile(rb"coding[:=]\s*([-\w.]+)")
588+
DEFAULT_ENCODING = "utf-8"
589+
590+
591+
def _decode_req_file(data: bytes, url: str) -> str:
592+
# order of BOMS is important: codecs.BOM_UTF16_LE is a prefix of codecs.BOM_UTF32_LE
593+
# so data.startswith(BOM_UTF16_LE) would be true for UTF32_LE data
594+
for bom, encoding in BOMS:
595+
if data.startswith(bom):
596+
return data[len(bom) :].decode(encoding)
597+
598+
# PEP-263 style comments
599+
for line in data.split(b"\n")[:2]:
600+
if line[0:1] == b"#" and ENCODING_RE.search(line):
601+
result = ENCODING_RE.search(line)
602+
assert result is not None
603+
encoding = result.groups()[0].decode("ascii")
604+
return data.decode(encoding)
605+
606+
try:
607+
return data.decode(DEFAULT_ENCODING)
608+
except UnicodeDecodeError:
609+
locale_encoding = locale.getpreferredencoding(False) or sys.getdefaultencoding()
610+
logging.warning(
611+
"unable to decode data from %s with default encoding %s, "
612+
"falling back to encoding from locale: %s. "
613+
"If this is intentional you should specify the encoding with a "
614+
"PEP-263 style comment, e.g. '# -*- coding: %s -*-'",
615+
url,
616+
DEFAULT_ENCODING,
617+
locale_encoding,
618+
locale_encoding,
619+
)
620+
return data.decode(locale_encoding)

src/pip/_internal/utils/encoding.py

Lines changed: 0 additions & 36 deletions
This file was deleted.

tests/unit/test_req_file.py

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import codecs
12
import collections
23
import logging
34
import os
@@ -942,3 +943,116 @@ def test_install_requirements_with_options(
942943
)
943944

944945
assert req.global_options == [global_option]
946+
947+
@pytest.mark.parametrize(
948+
"raw_req_file,expected_name,expected_spec",
949+
[
950+
pytest.param(
951+
b"Django==1.4.2",
952+
"Django",
953+
"==1.4.2",
954+
id="defaults to UTF-8",
955+
),
956+
pytest.param(
957+
"# coding=latin1\nDjango==1.4.2 # Pas trop de café".encode("latin-1"),
958+
"Django",
959+
"==1.4.2",
960+
id="decodes based on PEP-263 style headers",
961+
),
962+
],
963+
)
964+
def test_general_decoding(
965+
self,
966+
raw_req_file: bytes,
967+
expected_name: str,
968+
expected_spec: str,
969+
tmpdir: Path,
970+
session: PipSession,
971+
) -> None:
972+
req_file = tmpdir / "requirements.txt"
973+
req_file.write_bytes(raw_req_file)
974+
975+
reqs = tuple(parse_reqfile(req_file.resolve(), session=session))
976+
977+
assert len(reqs) == 1
978+
assert reqs[0].name == expected_name
979+
assert reqs[0].specifier == expected_spec
980+
981+
@pytest.mark.parametrize(
982+
"bom,encoding",
983+
[
984+
(codecs.BOM_UTF8, "utf-8"),
985+
(codecs.BOM_UTF16_BE, "utf-16-be"),
986+
(codecs.BOM_UTF16_LE, "utf-16-le"),
987+
(codecs.BOM_UTF32_BE, "utf-32-be"),
988+
(codecs.BOM_UTF32_LE, "utf-32-le"),
989+
# BOM automatically added when encoding byte-order dependent encodings
990+
(b"", "utf-16"),
991+
(b"", "utf-32"),
992+
],
993+
)
994+
def test_decoding_with_BOM(
995+
self, bom: bytes, encoding: str, tmpdir: Path, session: PipSession
996+
) -> None:
997+
req_name = "Django"
998+
req_specifier = "==1.4.2"
999+
encoded_contents = bom + f"{req_name}{req_specifier}".encode(encoding)
1000+
req_file = tmpdir / "requirements.txt"
1001+
req_file.write_bytes(encoded_contents)
1002+
1003+
reqs = tuple(parse_reqfile(req_file.resolve(), session=session))
1004+
1005+
assert len(reqs) == 1
1006+
assert reqs[0].name == req_name
1007+
assert reqs[0].specifier == req_specifier
1008+
1009+
def test_warns_and_fallsback_to_locale_on_utf8_decode_fail(
1010+
self,
1011+
tmpdir: Path,
1012+
session: PipSession,
1013+
caplog: pytest.LogCaptureFixture,
1014+
) -> None:
1015+
# \xff is valid in latin-1 but not UTF-8
1016+
data = b"pip<=24.0 # some comment\xff\n"
1017+
locale_encoding = "latin-1"
1018+
req_file = tmpdir / "requirements.txt"
1019+
req_file.write_bytes(data)
1020+
1021+
# it's hard to rely on a locale definitely existing for testing
1022+
# so patch things out for simplicity
1023+
with caplog.at_level(logging.WARNING), mock.patch(
1024+
"locale.getpreferredencoding", return_value=locale_encoding
1025+
):
1026+
reqs = tuple(parse_reqfile(req_file.resolve(), session=session))
1027+
1028+
assert len(caplog.records) == 1
1029+
assert (
1030+
caplog.records[0].msg
1031+
== "unable to decode data from %s with default encoding %s, "
1032+
"falling back to encoding from locale: %s. "
1033+
"If this is intentional you should specify the encoding with a "
1034+
"PEP-263 style comment, e.g. '# -*- coding: %s -*-'"
1035+
)
1036+
assert caplog.records[0].args == (
1037+
str(req_file),
1038+
"utf-8",
1039+
locale_encoding,
1040+
locale_encoding,
1041+
)
1042+
1043+
assert len(reqs) == 1
1044+
assert reqs[0].name == "pip"
1045+
assert str(reqs[0].specifier) == "<=24.0"
1046+
1047+
@pytest.mark.parametrize("encoding", ["utf-8", "gbk"])
1048+
def test_erorrs_on_non_decodable_data(
1049+
self, encoding: str, tmpdir: Path, session: PipSession
1050+
) -> None:
1051+
data = b"\xff"
1052+
req_file = tmpdir / "requirements.txt"
1053+
req_file.write_bytes(data)
1054+
1055+
with pytest.raises(UnicodeDecodeError), mock.patch(
1056+
"locale.getpreferredencoding", return_value=encoding
1057+
):
1058+
next(parse_reqfile(req_file.resolve(), session=session))

tests/unit/test_utils.py

Lines changed: 1 addition & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
44
"""
55

6-
import codecs
76
import os
87
import shutil
98
import stat
@@ -12,7 +11,7 @@
1211
from io import BytesIO
1312
from pathlib import Path
1413
from typing import Any, Callable, Iterator, List, NoReturn, Optional, Tuple, Type
15-
from unittest.mock import Mock, patch
14+
from unittest.mock import Mock
1615

1716
import pytest
1817

@@ -21,7 +20,6 @@
2120
from pip._internal.exceptions import HashMismatch, HashMissing, InstallationError
2221
from pip._internal.utils.deprecation import PipDeprecationWarning, deprecated
2322
from pip._internal.utils.egg_link import egg_link_path_from_location
24-
from pip._internal.utils.encoding import BOMS, auto_decode
2523
from pip._internal.utils.glibc import (
2624
glibc_version_string,
2725
glibc_version_string_confstr,
@@ -445,48 +443,6 @@ def test_has_one_of(self) -> None:
445443
assert not empty_hashes.has_one_of({"sha256": "xyzt"})
446444

447445

448-
class TestEncoding:
449-
"""Tests for pip._internal.utils.encoding"""
450-
451-
def test_auto_decode_utf_16_le(self) -> None:
452-
data = (
453-
b"\xff\xfeD\x00j\x00a\x00n\x00g\x00o\x00=\x00"
454-
b"=\x001\x00.\x004\x00.\x002\x00"
455-
)
456-
assert data.startswith(codecs.BOM_UTF16_LE)
457-
assert auto_decode(data) == "Django==1.4.2"
458-
459-
def test_auto_decode_utf_16_be(self) -> None:
460-
data = (
461-
b"\xfe\xff\x00D\x00j\x00a\x00n\x00g\x00o\x00="
462-
b"\x00=\x001\x00.\x004\x00.\x002"
463-
)
464-
assert data.startswith(codecs.BOM_UTF16_BE)
465-
assert auto_decode(data) == "Django==1.4.2"
466-
467-
def test_auto_decode_no_bom(self) -> None:
468-
assert auto_decode(b"foobar") == "foobar"
469-
470-
def test_auto_decode_pep263_headers(self) -> None:
471-
latin1_req = "# coding=latin1\n# Pas trop de café"
472-
assert auto_decode(latin1_req.encode("latin1")) == latin1_req
473-
474-
def test_auto_decode_no_preferred_encoding(self) -> None:
475-
om, em = Mock(), Mock()
476-
om.return_value = "ascii"
477-
em.return_value = None
478-
data = "data"
479-
with patch("sys.getdefaultencoding", om):
480-
with patch("locale.getpreferredencoding", em):
481-
ret = auto_decode(data.encode(sys.getdefaultencoding()))
482-
assert ret == data
483-
484-
@pytest.mark.parametrize("encoding", [encoding for bom, encoding in BOMS])
485-
def test_all_encodings_are_valid(self, encoding: str) -> None:
486-
# we really only care that there is no LookupError
487-
assert "".encode(encoding).decode(encoding) == ""
488-
489-
490446
def raises(error: Type[Exception]) -> NoReturn:
491447
raise error
492448

0 commit comments

Comments
 (0)