Skip to content

Commit aa0f744

Browse files
Handle req file decode failures on locale encoding
For the case where: * a requirements file is encoded as UTF-8, and * some bytes in the file are incompatible with the system locale In this case, fallback to decoding as UTF-8 as a last resort (rather than crashing on the `UnicodeDecodeError`). This behaviour was added when parsing the request file, rather than in `auto_decode` as it didn't seem to belong in a generic util (though that util looks to only be ever called when parsing requirements files anyway). Perhaps we should just go straight to UTF-8 without querying the system locale (unless there is a PEP-263 style comment), per the docs[1]: > Requirements files are utf-8 encoding by default But to avoid a breaking change just warn if decoding with this locale fails then fallback to UTF-8 [1] https://pip.pypa.io/en/stable/reference/requirements-file-format/#encoding
1 parent 300ed75 commit aa0f744

File tree

3 files changed

+59
-1
lines changed

3 files changed

+59
-1
lines changed

news/9a3e9584-3fd4-4840-916b-414c164f9c28.trivial.rst

Whitespace-only changes.

src/pip/_internal/req/req_file.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import re
99
import shlex
1010
import urllib.parse
11+
import warnings
1112
from optparse import Values
1213
from typing import (
1314
TYPE_CHECKING,
@@ -545,7 +546,23 @@ def get_file_content(url: str, session: "PipSession") -> Tuple[str, str]:
545546
# Assume this is a bare path.
546547
try:
547548
with open(url, "rb") as f:
548-
content = auto_decode(f.read())
549+
raw_content = f.read()
549550
except OSError as exc:
550551
raise InstallationError(f"Could not open requirements file: {exc}")
552+
553+
try:
554+
content = auto_decode(raw_content)
555+
except UnicodeDecodeError as exc:
556+
fallback_encoding = "utf-8"
557+
# don't try an decode again if we know it will fail
558+
if exc.encoding == fallback_encoding:
559+
raise
560+
561+
warnings.warn(
562+
f"unable to decode data with {exc.encoding}, falling back to {fallback_encoding}", # noqa: E501
563+
UnicodeWarning,
564+
stacklevel=2,
565+
)
566+
content = raw_content.decode(fallback_encoding)
567+
551568
return url, content

tests/unit/test_req_file.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import logging
33
import os
44
import textwrap
5+
import warnings
56
from optparse import Values
67
from pathlib import Path
78
from typing import Any, Iterator, List, Optional, Protocol, Tuple, Union
@@ -883,3 +884,43 @@ def test_install_requirements_with_options(
883884
)
884885

885886
assert req.global_options == [global_option]
887+
888+
def test_warns_on_decode_fail_in_locale(
889+
self, tmpdir: Path, session: PipSession
890+
) -> None:
891+
# \xe3\x80\x82 encodes to 'IDEOGRAPHIC FULL STOP' in UTF-8
892+
# the lone \x82 byte is invalid in the gbk encoding
893+
data = b"pip<=24.0 # some comment\xe3\x80\x82\n"
894+
locale_encoding = "gbk"
895+
req_file = tmpdir / "requirements.txt"
896+
req_file.write_bytes(data)
897+
898+
# it's hard to rely on a locale definitely existing for testing
899+
# so patch things out for simplicity
900+
with pytest.warns(UnicodeWarning) as records, mock.patch(
901+
"locale.getpreferredencoding", return_value=locale_encoding
902+
):
903+
reqs = tuple(parse_reqfile(req_file.resolve(), session=session))
904+
905+
assert len(records) == 1
906+
assert (
907+
str(records[0].message)
908+
== "unable to decode data with gbk, falling back to utf-8"
909+
)
910+
assert len(reqs) == 1
911+
assert reqs[0].name == "pip"
912+
assert str(reqs[0].specifier) == "<=24.0"
913+
914+
@pytest.mark.parametrize("encoding", ("utf-8", "gbk"))
915+
def test_erorrs_on_non_decodable_data(
916+
self, encoding: str, tmpdir: Path, session: PipSession
917+
) -> None:
918+
data = b"\xff"
919+
req_file = tmpdir / "requirements.txt"
920+
req_file.write_bytes(data)
921+
922+
with warnings.catch_warnings(), pytest.raises(UnicodeDecodeError), mock.patch(
923+
"locale.getpreferredencoding", return_value=encoding
924+
):
925+
warnings.simplefilter("ignore") # suppress warning not under test here
926+
next(parse_reqfile(req_file.resolve(), session=session))

0 commit comments

Comments
 (0)