From a0be5e54d3ac504d0e7943666377e2f5c0c18f86 Mon Sep 17 00:00:00 2001
From: AA Turner <9087854+AA-Turner@users.noreply.github.com>
Date: Tue, 20 Apr 2021 04:37:04 +0100
Subject: [PATCH 1/5] Add pep_rss_gen.py

---
 pep_rss_gen.py | 137 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 pep_rss_gen.py

diff --git a/pep_rss_gen.py b/pep_rss_gen.py
new file mode 100644
index 00000000000..9613c4699dc
--- /dev/null
+++ b/pep_rss_gen.py
@@ -0,0 +1,137 @@
+import datetime
+import email.utils
+from pathlib import Path
+import re
+
+from dateutil import parser
+import docutils.frontend
+import docutils.nodes
+import docutils.parsers.rst
+import docutils.utils
+from feedgen import entry
+from feedgen import feed
+
+
+# Monkeypatch feedgen.util.formatRFC2822
+def _format_rfc_2822(dt: datetime.datetime) -> str:
+    return email.utils.format_datetime(dt, usegmt=True)
+
+
+entry.formatRFC2822 = feed.formatRFC2822 = _format_rfc_2822
+line_cache: dict[Path, dict[str, str]] = {}
+
+
+def first_line_starting_with(full_path: Path, text: str) -> str:
+    # Try and retrieve from cache
+    if full_path in line_cache:
+        return line_cache[full_path].get(text, "")
+
+    # Else read source
+    line_cache[full_path] = path_cache = {}
+    for line in full_path.open(encoding="utf-8"):
+        if line.startswith("Created:"):
+            path_cache["Created:"] = line.removeprefix("Created:").strip()
+        elif line.startswith("Title:"):
+            path_cache["Title:"] = line.removeprefix("Title:").strip()
+        elif line.startswith("Author:"):
+            path_cache["Author:"] = line.removeprefix("Author:").strip()
+
+        # Once all have been found, exit loop (keys() must be called: the bound method never equals a set)
+        if path_cache.keys() == {"Created:", "Title:", "Author:"}:
+            break
+    return path_cache.get(text, "")
+
+
+def pep_creation(full_path: Path) -> datetime.datetime:
+    created_str = first_line_starting_with(full_path, "Created:")
+    # bleh, I was hoping to avoid re but some PEPs editorialize on the Created line
+    # (note as of Aug 2020 only PEP 102 has additional content on the Created line)
+    m = re.search(r"(\d+[- ][\w\d]+[- ]\d{2,4})", created_str)
+    if not m:
+        # some older ones have an empty line, that's okay, if it's old we ipso facto don't care about it.
+        # "return None" would make the most sense but datetime objects refuse to compare with that. :-|
+        return datetime.datetime(1900, 1, 1)
+    created_str = m.group(1)
+    try:
+        return parser.parse(created_str, dayfirst=True)
+    except (ValueError, OverflowError):
+        return datetime.datetime(1900, 1, 1)
+
+
+def parse_rst(text: str) -> docutils.nodes.document:
+    rst_parser = docutils.parsers.rst.Parser()
+    components = (docutils.parsers.rst.Parser,)
+    settings = docutils.frontend.OptionParser(components=components).get_default_values()
+    document = docutils.utils.new_document('', settings=settings)
+    rst_parser.parse(text, document)
+    return document
+
+
+def pep_abstract(full_path: Path) -> str:
+    """Return the first paragraph of the PEP abstract"""
+    text = full_path.read_text(encoding="utf-8")
+    for node in parse_rst(text):
+        if "Abstract" in str(node):
+            for child in node:
+                if child.tagname == "paragraph":
+                    return child.astext().strip().replace("\n", " ")
+    return ""
+
+
+def main():
+    # get the directory with the PEP sources
+    pep_dir = Path(__file__).parent
+
+    # get list of peps with creation time (from "Created:" string in pep source)
+    peps_with_dt = sorted((pep_creation(path), path) for path in pep_dir.glob("pep-????.*"))
+
+    # generate rss items for 10 most recent peps
+    items = []
+    for dt, full_path in peps_with_dt[-10:]:
+        try:
+            pep_num = int(full_path.stem.split("-")[-1])
+        except ValueError:
+            continue
+
+        title = first_line_starting_with(full_path, "Title:")
+        author = first_line_starting_with(full_path, "Author:")
+        parsed_authors = email.utils.getaddresses([author]) if "@" in author else [(author, "")]
+        url = f"https://www.python.org/dev/peps/pep-{pep_num:0>4}"
+
+        item = entry.FeedEntry()
+        item.title(f"PEP {pep_num}: {title}")
+        item.link(href=url)
+        item.description(pep_abstract(full_path))
+        item.guid(url, permalink=True)
+        item.published(dt.replace(tzinfo=datetime.timezone.utc))  # ensure datetime has a timezone
+        item.author([dict(name=parsed_author[0], email=parsed_author[1]) for parsed_author in parsed_authors])
+        items.append(item)
+
+    # The rss envelope
+    desc = """
+    Newest Python Enhancement Proposals (PEPs) - Information on new
+    language features, and some meta-information like release
+    procedure and schedules.
+    """.replace("\n    ", " ").strip()
+
+    # Setup feed generator
+    fg = feed.FeedGenerator()
+    fg.language("en")
+    fg.generator("")
+    fg.docs("https://cyber.harvard.edu/rss/rss.html")
+
+    # Add metadata
+    fg.title("Newest Python PEPs")
+    fg.link(href="https://www.python.org/dev/peps")
+    fg.description(desc)
+    fg.lastBuildDate(datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc))
+
+    # Add PEP information (ordered by newest first)
+    for item in items:
+        fg.add_entry(item)
+
+    pep_dir.joinpath("peps.rss").write_bytes(fg.rss_str(pretty=True))
+
+
+if __name__ == "__main__":
+    main()

From add7f45838a19917999bd4ef2763cdaa3a3d7d64 Mon Sep 17 00:00:00 2001
From: AA Turner <9087854+AA-Turner@users.noreply.github.com>
Date: Fri, 7 May 2021 14:27:56 +0100
Subject: [PATCH 2/5] Add new RSS target

---
 Makefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index eebb2d613ef..0f201b0c04a 100644
--- a/Makefile
+++ b/Makefile
@@ -60,7 +60,11 @@ lint:
 SPHINX_JOBS=8
 SPHINX_BUILD=$(PYTHON) build.py -j $(SPHINX_JOBS)
 
-pages: rss
+# TODO replace `rss:` with this when merged & tested
+pep_rss:
+	$(PYTHON) pep_rss_gen.py
+
+pages: pep_rss
 	$(SPHINX_BUILD) --index-file
 
 sphinx:

From 0d5e6ec6b87bc75058f9b7f9c7b50c8e3fbf3747 Mon Sep 17 00:00:00 2001
From: AA Turner <9087854+AA-Turner@users.noreply.github.com>
Date: Wed, 9 Jun 2021 00:13:01 +0100
Subject: [PATCH 3/5] Add RSS dependencies

---
 requirements.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index 99202ceca35..837f41b3ef7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,6 @@
 # Requirements for building PEPs with Sphinx
 sphinx >= 3.5
 docutils >= 0.16
+
+# For RSS
+feedgen >= 0.9.0 # For RSS feed

From 45e295488e5fd99d03450c74f596ef12c6d8f418 Mon Sep 17 00:00:00 2001
From: AA Turner <9087854+AA-Turner@users.noreply.github.com>
Date: Sat, 12 Jun 2021 20:08:04 +0100
Subject: [PATCH 4/5] Ensure that there is only ever one author element in the
 RSS document

---
 pep_rss_gen.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/pep_rss_gen.py b/pep_rss_gen.py
index 9613c4699dc..21ede194012 100644
--- a/pep_rss_gen.py
+++ b/pep_rss_gen.py
@@ -95,7 +95,14 @@ def main():
 
         title = first_line_starting_with(full_path, "Title:")
         author = first_line_starting_with(full_path, "Author:")
-        parsed_authors = email.utils.getaddresses([author]) if "@" in author else [(author, "")]
+        if "@" in author or " at " in author:
+            parsed_authors = email.utils.getaddresses([author])
+            # ideal would be to pass as a list of dicts with names and emails to
+            # item.author, but FeedGen's RSS output doesn't pass W3C
+            # validation (as of 12/06/2021)
+            joined_authors = ", ".join(f"{name} ({email_address})" for name, email_address in parsed_authors)
+        else:
+            joined_authors = author
         url = f"https://www.python.org/dev/peps/pep-{pep_num:0>4}"
 
         item = entry.FeedEntry()
@@ -104,7 +111,7 @@ def main():
         item.description(pep_abstract(full_path))
         item.guid(url, permalink=True)
         item.published(dt.replace(tzinfo=datetime.timezone.utc))  # ensure datetime has a timezone
-        item.author([dict(name=parsed_author[0], email=parsed_author[1]) for parsed_author in parsed_authors])
+        item.author(email=joined_authors)
         items.append(item)
 
     # The rss envelope

From a8b01b9ea5ce847963a1e384e105e4923c0af0c9 Mon Sep 17 00:00:00 2001
From: AA Turner <9087854+AA-Turner@users.noreply.github.com>
Date: Sat, 12 Jun 2021 20:08:15 +0100
Subject: [PATCH 5/5] Add self link to RSS feed

---
 pep_rss_gen.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pep_rss_gen.py b/pep_rss_gen.py
index 21ede194012..a06ffd20caa 100644
--- a/pep_rss_gen.py
+++ b/pep_rss_gen.py
@@ -130,6 +130,7 @@ def main():
     # Add metadata
     fg.title("Newest Python PEPs")
     fg.link(href="https://www.python.org/dev/peps")
+    fg.link(href="https://www.python.org/dev/peps/peps.rss", rel="self")
     fg.description(desc)
     fg.lastBuildDate(datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc))
 