perf: Avoid unnecessary URL processing while parsing links
There are three optimizations in this commit, in descending order of
impact:

- If the file URL in the "project detail" response is already absolute,
  then avoid calling urljoin() as it's expensive (mostly because it
  calls urlparse() on both of its URL arguments) and does nothing. While
  it'd be more correct to check whether the file URL has a scheme, we'd
  need to parse the URL, which is what we're trying to avoid in the
  first place. Instead, by simply checking whether the URL starts with
  http[s]://, we can avoid slow urljoin() calls for PyPI responses (see
  the first sketch after the footnotes).

- Replacing urllib.parse.urlparse() with urllib.parse.urlsplit() in
  _ensure_quoted_url(). The two URL parsing functions are equivalent
  for our needs[^1], but urlsplit() is faster, and its internal cache
  is utilized better when we call it directly[^2] (see the second
  sketch after the footnotes).

- Calculating the Link.path property ahead of time, as it's very hot
  (see the third sketch after the footnotes).

[^1]: we don't care about URL parameters AFAIK (which are different
  from the query component!)

[^2]: urlparse() calls urlsplit() internally, but it passes the authority
  parameter (unlike any of our calls) so it bypasses the cache.
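
To illustrate the first optimization, here's a minimal sketch (the URLs
and variable names are made up, not pip code; the real helper this
commit adds is _absolute_link_url()):

```python
import urllib.parse

base_url = "https://pypi.org/simple/pip/"  # hypothetical page URL
file_url = "https://files.pythonhosted.org/packages/example/pip-25.0.tar.gz"  # hypothetical

# When the file URL is already absolute, urljoin() parses both URLs
# only to hand the second one back unchanged.
assert urllib.parse.urljoin(base_url, file_url) == file_url

# A cheap prefix check skips that work in the common (PyPI) case, while
# relative URLs still take the slow path.
if file_url.startswith(("https://", "http://")):
    url = file_url
else:
    url = urllib.parse.urljoin(base_url, file_url)
assert url == file_url
```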
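
To illustrate the second optimization, a sketch of why the two parsing
functions are interchangeable for our purposes (again with a made-up
URL):

```python
import urllib.parse

url = "https://files.pythonhosted.org/packages/example/pip-25.0.tar.gz#sha256=deadbeef"  # hypothetical

parsed = urllib.parse.urlparse(url)  # (scheme, netloc, path, params, query, fragment)
split = urllib.parse.urlsplit(url)   # (scheme, netloc, path, query, fragment)

# The only difference is that urlparse() also splits the rarely used
# `;parameters` component off the path. For URLs without it -- which is
# every URL pip handles here -- the shared fields agree exactly.
assert parsed.params == ""
assert (parsed.scheme, parsed.netloc, parsed.path, parsed.query, parsed.fragment) == tuple(split)

# Round-tripping works the same way with the *split variants.
assert urllib.parse.urlunsplit(split) == urllib.parse.urlunparse(parsed) == url
```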
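
And to illustrate the third optimization, a sketch of the
precompute-in-__init__ pattern (HotLink is a hypothetical stand-in for
pip's Link class):

```python
import urllib.parse

class HotLink:
    __slots__ = ["_parsed_url", "_path"]

    def __init__(self, url: str) -> None:
        self._parsed_url = urllib.parse.urlsplit(url)
        # .path is read many times per link during collection, so pay
        # the unquote() cost once here instead of on every access.
        self._path = urllib.parse.unquote(self._parsed_url.path)

    @property
    def path(self) -> str:
        return self._path  # precomputed; was unquote(...) on each call
```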
ichard26 committed Dec 27, 2024
1 parent c10dda5 commit 83f9a08
Showing 2 changed files with 28 additions and 6 deletions.
1 change: 1 addition & 0 deletions news/13132.feature.rst
@@ -0,0 +1 @@
+Optimize package collection by avoiding unnecessary URL parsing and other processing.
33 changes: 27 additions & 6 deletions src/pip/_internal/models/link.py
@@ -170,12 +170,30 @@ def _ensure_quoted_url(url: str) -> str:
     and without double-quoting other characters.
     """
     # Split the URL into parts according to the general structure
-    # `scheme://netloc/path;parameters?query#fragment`.
-    result = urllib.parse.urlparse(url)
+    # `scheme://netloc/path?query#fragment`.
+    result = urllib.parse.urlsplit(url)
     # If the netloc is empty, then the URL refers to a local filesystem path.
     is_local_path = not result.netloc
     path = _clean_url_path(result.path, is_local_path=is_local_path)
-    return urllib.parse.urlunparse(result._replace(path=path))
+    return urllib.parse.urlunsplit(result._replace(path=path))
 
 
+def _absolute_link_url(base_url: str, file_url: str) -> str:
+    """Return an absolute file URL from a simple response.
+
+    If the file URL is already absolute, the function does nothing.
+    If the file URL is relative, it is joined with the base URL.
+    """
+    # If the file URL is already absolute, joining it with the page URL
+    # will do nothing. PyPI returns absolute URLs so we can avoid expensive
+    # urljoin() calls in the common case. (It would be technically more
+    # correct to parse the file URL and check if it has a scheme, but the
+    # slow URL parsing urljoin() does is what we're trying to avoid in the
+    # first place, so we only check for the http[s]:// prefix.)
+    if file_url.startswith(("https://", "http://")):
+        return file_url
+    else:
+        return urllib.parse.urljoin(base_url, file_url)
+
+
 @functools.total_ordering
@@ -185,6 +203,7 @@ class Link:
     __slots__ = [
         "_parsed_url",
         "_url",
+        "_path",
         "_hashes",
         "comes_from",
         "requires_python",
@@ -241,6 +260,8 @@ def __init__(
         # Store the url as a private attribute to prevent accidentally
         # trying to set a new value.
         self._url = url
+        # The .path property is hot, so calculate its value ahead of time.
+        self._path = urllib.parse.unquote(self._parsed_url.path)
 
         link_hash = LinkHash.find_hash_url_fragment(url)
         hashes_from_link = {} if link_hash is None else link_hash.as_dict()
@@ -270,7 +291,7 @@ def from_json(
         if file_url is None:
             return None
 
-        url = _ensure_quoted_url(urllib.parse.urljoin(page_url, file_url))
+        url = _ensure_quoted_url(_absolute_link_url(page_url, file_url))
         pyrequire = file_data.get("requires-python")
         yanked_reason = file_data.get("yanked")
         hashes = file_data.get("hashes", {})
@@ -322,7 +343,7 @@ def from_element(
         if not href:
             return None
 
-        url = _ensure_quoted_url(urllib.parse.urljoin(base_url, href))
+        url = _ensure_quoted_url(_absolute_link_url(base_url, href))
         pyrequire = anchor_attribs.get("data-requires-python")
         yanked_reason = anchor_attribs.get("data-yanked")
 
@@ -421,7 +442,7 @@ def netloc(self) -> str:
 
     @property
     def path(self) -> str:
-        return urllib.parse.unquote(self._parsed_url.path)
+        return self._path
 
     def splitext(self) -> Tuple[str, str]:
         return splitext(posixpath.basename(self.path.rstrip("/")))
