Smarter (and looser) link equivalency logic

uranusjr · uranusjr · commit c55d17c8cde2 · 2021-06-19T03:29:44.000+08:00
diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py
@@ -2,7 +2,7 @@
 import posixpath
 import re
 import urllib.parse
-from typing import TYPE_CHECKING, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, NamedTuple, Optional, Tuple, Union
 
 from pip._internal.utils.filetypes import WHEEL_EXTENSION
 from pip._internal.utils.hashes import Hashes
@@ -242,7 +242,50 @@ def is_hash_allowed(self, hashes):
         return hashes.is_hash_allowed(self.hash_name, hex_digest=self.hash)
 
 
-# TODO: Relax this comparison logic to ignore, for example, fragments.
+class _CleanResult(NamedTuple):
+    """Convert link for equivalency check.
+
+    This is used in the resolver to check whether two URL-specified requirements
+    likely point to the same distribution and can be considered equivalent. This
+    equivalency logic avoids comparing URLs literally, which can be too strict
+    (e.g. "a=1&b=2" vs "b=2&a=1") and produce conflicts that users do not expect.
+
+    Currently this does three things:
+
+    1. Drop the basic auth part. This is technically wrong since a server can
+       serve different content based on auth, but if it does that, it is
+       impossible to guarantee even that two URLs without auth are equivalent,
+       since the user can input different auth information when prompted. So
+       the practical solution is to assume the auth doesn't affect the response.
+    2. Parse the query to avoid the ordering issue.
+    3. Parse the fragment, and explicitly drop the "egg=" part since it is
+       commonly provided as the project name for compatibility. This is wrong in
+       the strictest sense, but too many people are doing it.
+
+    Note that the ordering of values under the same key in query and fragment
+    is NOT cleaned; i.e. "a=1&a=2" and "a=2&a=1" are still considered different.
+    """
+
+    parsed: urllib.parse.SplitResult
+    query: Dict[str, List[str]]
+    fragment: Dict[str, List[str]]
+
+    @classmethod
+    def from_link(cls, link: Link) -> "_CleanResult":
+        parsed = link._parsed_url
+        netloc = parsed.netloc.rsplit("@", 1)[-1]
+        # The fragment does not necessarily use the query string format
+        # (it's a pip-specific syntax), so we set keep_blank_values to keep
+        # a fragment that's not a key-value pair (e.g. "#title_1").
+        frag_qs = urllib.parse.parse_qs(parsed.fragment, keep_blank_values=True)
+        frag_qs.pop("egg", None)
+        return _CleanResult(
+            parsed=parsed._replace(netloc=netloc, query="", fragment=""),
+            query=urllib.parse.parse_qs(parsed.query),
+            fragment=frag_qs,
+        )
+
+
 def links_equivalent(link1, link2):
     # type: (Link, Link) -> bool
-    return link1 == link2
+    return _CleanResult.from_link(link1) == _CleanResult.from_link(link2)