Smarter (and looser) link equivalency logic

uranusjr · uranusjr · commit a323183dc435 · 2021-06-22T12:13:23.000+08:00
diff --git a/news/10002.feature.rst b/news/10002.feature.rst
@@ -0,0 +1,6 @@
+New resolver: Loosen URL comparison logic when checking for direct URL reference
+equivalency. The logic includes the following notable characteristics:
+
+* The authentication part of the URL is explicitly ignored.
+* The ``egg=`` fragment is explicitly ignored.
+* Query and fragment parts of the URL are hashed to allow ordering differences.
diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py
@@ -1,8 +1,10 @@
+import functools
+import logging
 import os
 import posixpath
 import re
 import urllib.parse
-from typing import TYPE_CHECKING, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional, Tuple, Union
 
 from pip._internal.utils.filetypes import WHEEL_EXTENSION
 from pip._internal.utils.hashes import Hashes
@@ -17,6 +19,8 @@
 if TYPE_CHECKING:
     from pip._internal.index.collector import HTMLPage
 
+logger = logging.getLogger(__name__)
+
 
 class Link(KeyBasedCompareMixin):
     """Represents a parsed link from a Package Index's simple URL
@@ -242,7 +246,49 @@ def is_hash_allowed(self, hashes):
         return hashes.is_hash_allowed(self.hash_name, hex_digest=self.hash)
 
 
-# TODO: Relax this comparison logic to ignore, for example, fragments.
+class _CleanResult(NamedTuple):
+    """Convert link for equivalency check.
+
+    This is used in the resolver to check whether two URL-specified requirements
+    likely point to the same distribution and can be considered equivalent. This
+    equivalency logic avoids comparing URLs literally, which can be too strict
+    (e.g. "a=1&b=2" vs "b=2&a=1") and produce conflicts unexpecting to users.
+
+    Currently this does three things:
+
+    1. Drop the basic auth part. This is technically wrong since a server can
+       serve different content based on auth, but if it does that, it is even
+       impossible to guarantee two URLs without auth are equivalent, since
+       the user can input different auth information when prompted. So the
+       practical solution is to assume the auth doesn't affect the response.
+    2. Parse the query to avoid the ordering issue.
+    3. Parse the fragment, and explicitly drop the "egg=" part since it is
+       commonly provided as the project name for compatibility. This is wrong in
+       the strictest sense, but too many people are doing it.
+
+    Note that query value ordering under the same key in query and fragment are
+    NOT cleaned; i.e. "a=1&a=2" and "a=2&a=1" are still considered different.
+    """
+
+    parsed: urllib.parse.SplitResult
+    query: Dict[str, List[str]]
+    fragment: Dict[str, List[str]]
+
+    @classmethod
+    def from_link(cls, link: Link) -> "_CleanResult":
+        parsed = link._parsed_url
+        netloc = parsed.netloc.rsplit("@", 1)[-1]
+        fragment = urllib.parse.parse_qs(parsed.fragment)
+        if fragment.pop("egg", None):
+            logger.debug("Ignoring egg= fragment in %s", link)
+        return _CleanResult(
+            parsed=parsed._replace(netloc=netloc, query="", fragment=""),
+            query=urllib.parse.parse_qs(parsed.query),
+            fragment=fragment,
+        )
+
+
+@functools.lru_cache(maxsize=None)
 def links_equivalent(link1, link2):
     # type: (Link, Link) -> bool
-    return link1 == link2
+    return _CleanResult.from_link(link1) == _CleanResult.from_link(link2)
diff --git a/tests/functional/test_install_reqs.py b/tests/functional/test_install_reqs.py
@@ -423,25 +423,16 @@ def test_constraints_constrain_to_local(script, data, resolver_variant):
     assert 'Running setup.py install for singlemodule' in result.stdout
 
 
-def test_constrained_to_url_install_same_url(script, data, resolver_variant):
+def test_constrained_to_url_install_same_url(script, data):
     to_install = data.src.joinpath("singlemodule")
     constraints = path_to_url(to_install) + "#egg=singlemodule"
     script.scratch_path.joinpath("constraints.txt").write_text(constraints)
     result = script.pip(
         'install', '--no-index', '-f', data.find_links, '-c',
         script.scratch_path / 'constraints.txt', to_install,
         allow_stderr_warning=True,
-        expect_error=(resolver_variant == "2020-resolver"),
     )
-    if resolver_variant == "2020-resolver":
-        assert 'Cannot install singlemodule 0.0.1' in result.stderr, str(result)
-        assert (
-            'because these package versions have conflicting dependencies.'
-            in result.stderr
-        ), str(result)
-    else:
-        assert ('Running setup.py install for singlemodule'
-                in result.stdout), str(result)
+    assert 'Running setup.py install for singlemodule' in result.stdout, str(result)
 
 
 def test_double_install_spurious_hash_mismatch(
diff --git a/tests/functional/test_new_resolver.py b/tests/functional/test_new_resolver.py
@@ -1791,6 +1791,96 @@ def test_new_resolver_avoids_incompatible_wheel_tags_in_constraint_url(
     assert_not_installed(script, "dep")
 
 
+@pytest.mark.parametrize(
+    "suffixes_equivalent, depend_suffix, request_suffix",
+    [
+        pytest.param(
+            True,
+            "#egg=foo",
+            "",
+            id="drop-depend-egg",
+        ),
+        pytest.param(
+            True,
+            "",
+            "#egg=foo",
+            id="drop-request-egg",
+        ),
+        pytest.param(
+            True,
+            "#subdirectory=bar&egg=foo",
+            "#subdirectory=bar&egg=bar",
+            id="drop-egg-only",
+        ),
+        pytest.param(
+            True,
+            "#subdirectory=bar&egg=foo",
+            "#egg=foo&subdirectory=bar",
+            id="fragment-ordering",
+        ),
+        pytest.param(
+            True,
+            "?a=1&b=2",
+            "?b=2&a=1",
+            id="query-opordering",
+        ),
+        pytest.param(
+            False,
+            "#sha512=1234567890abcdef",
+            "#sha512=abcdef1234567890",
+            id="different-keys",
+        ),
+        pytest.param(
+            False,
+            "#sha512=1234567890abcdef",
+            "#md5=1234567890abcdef",
+            id="different-values",
+        ),
+        pytest.param(
+            False,
+            "#subdirectory=bar&egg=foo",
+            "#subdirectory=rex",
+            id="drop-egg-still-different",
+        ),
+    ],
+)
+def test_new_resolver_direct_url_equivalent(
+    tmp_path,
+    script,
+    suffixes_equivalent,
+    depend_suffix,
+    request_suffix,
+):
+    pkga = create_basic_wheel_for_package(script, name="pkga", version="1")
+    pkgb = create_basic_wheel_for_package(
+        script,
+        name="pkgb",
+        version="1",
+        depends=[f"pkga@{path_to_url(pkga)}{depend_suffix}"],
+    )
+
+    # Make pkgb visible via --find-links, but not pkga.
+    find_links = tmp_path.joinpath("find_links")
+    find_links.mkdir()
+    with open(pkgb, "rb") as f:
+        find_links.joinpath(pkgb.name).write_bytes(f.read())
+
+    # Install pkgb from --find-links, and pkga directly but from a different
+    # URL suffix as specified in pkgb. This should work!
+    script.pip(
+        "install",
+        "--no-cache-dir", "--no-index",
+        "--find-links", str(find_links),
+        f"{path_to_url(pkga)}{request_suffix}", "pkgb",
+        expect_error=(not suffixes_equivalent),
+    )
+
+    if suffixes_equivalent:
+        assert_installed(script, pkga="1", pkgb="1")
+    else:
+        assert_not_installed(script, "pkga", "pkgb")
+
+
 def test_new_resolver_direct_url_with_extras(tmp_path, script):
     pkg1 = create_basic_wheel_for_package(script, name="pkg1", version="1")
     pkg2 = create_basic_wheel_for_package(
diff --git a/tests/unit/test_link.py b/tests/unit/test_link.py
@@ -1,6 +1,6 @@
 import pytest
 
-from pip._internal.models.link import Link
+from pip._internal.models.link import Link, links_equivalent
 from pip._internal.utils.hashes import Hashes
 
 
@@ -138,3 +138,56 @@ def test_is_hash_allowed__none_hashes(self, hashes, expected):
     def test_is_vcs(self, url, expected):
         link = Link(url)
         assert link.is_vcs is expected
+
+
+@pytest.mark.parametrize(
+    "url1, url2",
+    [
+        pytest.param(
+            "https://example.com/foo#egg=foo",
+            "https://example.com/foo",
+            id="drop-egg",
+        ),
+        pytest.param(
+            "https://example.com/foo#subdirectory=bar&egg=foo",
+            "https://example.com/foo#subdirectory=bar&egg=bar",
+            id="drop-egg-only",
+        ),
+        pytest.param(
+            "https://example.com/foo#subdirectory=bar&egg=foo",
+            "https://example.com/foo#egg=foo&subdirectory=bar",
+            id="fragment-ordering",
+        ),
+        pytest.param(
+            "https://example.com/foo?a=1&b=2",
+            "https://example.com/foo?b=2&a=1",
+            id="query-opordering",
+        ),
+    ],
+)
+def test_links_equivalent(url1, url2):
+    assert links_equivalent(Link(url1), Link(url2))
+
+
+@pytest.mark.parametrize(
+    "url1, url2",
+    [
+        pytest.param(
+            "https://example.com/foo#sha512=1234567890abcdef",
+            "https://example.com/foo#sha512=abcdef1234567890",
+            id="different-keys",
+        ),
+        pytest.param(
+            "https://example.com/foo#sha512=1234567890abcdef",
+            "https://example.com/foo#md5=1234567890abcdef",
+            id="different-values",
+        ),
+        pytest.param(
+            "https://example.com/foo#subdirectory=bar&egg=foo",
+            "https://example.com/foo#subdirectory=rex",
+            id="drop-egg-still-different",
+        ),
+    ],
+)
+def test_links_equivalent_false(url1, url2):
+    assert not links_equivalent(Link(url1), Link(url2))