From 744ce6e0d0ee424147e9c0e83571037575896ee6 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 18 Mar 2026 11:06:57 +0000 Subject: [PATCH 1/6] fix: exclude YouTube from Mastodon detection, add YouTube as conference link YouTube URLs with /@channel patterns were incorrectly matched as Mastodon profiles. This fixes the extractor to properly identify YouTube links and adds YouTube as a first-class conference field for enrichment and display. - Fix extract_links_from_url to detect YouTube before generic /@username - Add youtube field to Conference schema, validation, and data model - Add YouTube display on conference detail pages - Add tests for YouTube extraction and Mastodon disambiguation https://claude.ai/code/session_0154a8RdG7M2nj83zPWVodgZ --- _includes/head.html | 1 + _layouts/conference.html | 6 ++ tests/test_youtube_extraction.py | 99 ++++++++++++++++++++++++++++++++ utils/enrich_tba.py | 15 ++++- utils/schema.yml | 1 + utils/tidy_conf/schema.py | 3 +- utils/tidy_conf/validation.py | 1 + 7 files changed, 123 insertions(+), 3 deletions(-) create mode 100644 tests/test_youtube_extraction.py diff --git a/_includes/head.html b/_includes/head.html index 46e20587854..5fa233c0f8f 100644 --- a/_includes/head.html +++ b/_includes/head.html @@ -89,6 +89,7 @@ twitter: {{ conf.twitter | jsonify }}, mastodon: {{ conf.mastodon | jsonify }}, bluesky: {{ conf.bluesky | jsonify }}, + youtube: {{ conf.youtube | jsonify }}, location: {{ conf.location | jsonify }}, extra_places: {{ conf.extra_places | jsonify }}, workshop_deadline: {{ conf.workshop_deadline | jsonify }}, diff --git a/_layouts/conference.html b/_layouts/conference.html index 1865c52fd4e..8b99898e43f 100644 --- a/_layouts/conference.html +++ b/_layouts/conference.html @@ -162,6 +162,12 @@

a.k.a. {{page.alt_name}} {{page.year}}

Mastodon {% endif %} + {% if page.youtube %} +
+ + YouTube +
+ {% endif %}
diff --git a/tests/test_youtube_extraction.py b/tests/test_youtube_extraction.py new file mode 100644 index 00000000000..87da528d1fa --- /dev/null +++ b/tests/test_youtube_extraction.py @@ -0,0 +1,99 @@ +"""Tests for YouTube link extraction and Mastodon/YouTube disambiguation.""" + +import sys +from pathlib import Path +from unittest.mock import patch + +sys.path.append(str(Path(__file__).parent.parent / "utils")) + +from enrich_tba import extract_links_from_url + + +class TestYouTubeExtraction: + """Test YouTube link detection in extract_links_from_url.""" + + @patch("enrich_tba.get_all_links") + def test_youtube_channel_detected(self, mock_links): + """YouTube /@channel links are detected as youtube, not mastodon.""" + mock_links.return_value = [ + "https://www.youtube.com/@PyConUS", + ] + result = extract_links_from_url("https://pycon.org") + assert "youtube" in result + assert result["youtube"] == "https://www.youtube.com/@PyConUS" + assert "mastodon" not in result + + @patch("enrich_tba.get_all_links") + def test_youtube_channel_url_without_at(self, mock_links): + """YouTube channel links without @ are detected.""" + mock_links.return_value = [ + "https://www.youtube.com/channel/UCMjMBMGt0WP2usFilILnbcA", + ] + result = extract_links_from_url("https://pycon.org") + assert "youtube" in result + assert "mastodon" not in result + + @patch("enrich_tba.get_all_links") + def test_youtube_not_mistaken_for_mastodon(self, mock_links): + """YouTube /@username must not end up in mastodon field.""" + mock_links.return_value = [ + "https://www.youtube.com/@EuroPython", + "https://fosstodon.org/@europython", + ] + result = extract_links_from_url("https://europython.eu") + assert result.get("youtube") == "https://www.youtube.com/@EuroPython" + assert result.get("mastodon") == "https://fosstodon.org/@europython" + + @patch("enrich_tba.get_all_links") + def test_youtu_be_short_link(self, mock_links): + """Short youtu.be links are detected as youtube.""" + 
mock_links.return_value = [ + "https://youtu.be/abc123", + ] + result = extract_links_from_url("https://pycon.org") + assert "youtube" in result + assert "mastodon" not in result + + @patch("enrich_tba.get_all_links") + def test_mastodon_still_works(self, mock_links): + """Mastodon links on known instances still detected correctly.""" + mock_links.return_value = [ + "https://fosstodon.org/@pycon", + ] + result = extract_links_from_url("https://pycon.org") + assert "mastodon" in result + assert result["mastodon"] == "https://fosstodon.org/@pycon" + assert "youtube" not in result + + @patch("enrich_tba.get_all_links") + def test_generic_mastodon_still_works(self, mock_links): + """Generic /@username on unknown instances still detected as mastodon.""" + mock_links.return_value = [ + "https://social.example.org/@pyconf", + ] + result = extract_links_from_url("https://pyconf.org") + assert "mastodon" in result + assert "youtube" not in result + + @patch("enrich_tba.get_all_links") + def test_youtube_first_seen_wins(self, mock_links): + """Only the first YouTube link is kept.""" + mock_links.return_value = [ + "https://www.youtube.com/@PyConUS", + "https://www.youtube.com/@AnotherChannel", + ] + result = extract_links_from_url("https://pycon.org") + assert result["youtube"] == "https://www.youtube.com/@PyConUS" + + @patch("enrich_tba.get_all_links") + def test_all_social_links_extracted(self, mock_links): + """YouTube, Mastodon, and Bluesky can all be extracted together.""" + mock_links.return_value = [ + "https://bsky.app/profile/pycon.org", + "https://www.youtube.com/@PyConUS", + "https://fosstodon.org/@pycon", + ] + result = extract_links_from_url("https://pycon.org") + assert "bluesky" in result + assert "youtube" in result + assert "mastodon" in result diff --git a/utils/enrich_tba.py b/utils/enrich_tba.py index b6fa66cbe6f..f84dc0defb2 100644 --- a/utils/enrich_tba.py +++ b/utils/enrich_tba.py @@ -50,7 +50,7 @@ MAX_CONTENT_LENGTH = 15000 # Max characters per 
conference website # Field type categorization for validation -URL_FIELDS = {"sponsor", "finaid", "mastodon", "bluesky", "cfp_link"} +URL_FIELDS = {"sponsor", "finaid", "mastodon", "bluesky", "youtube", "cfp_link"} DATE_FIELDS = {"cfp", "workshop_deadline", "tutorial_deadline"} TIMEZONE_FIELD = "timezone" @@ -341,14 +341,25 @@ def extract_links_from_url(url: str) -> dict[str, str]: seen_types.add("bluesky") logger.debug(f" Found bluesky: {link}") + # YouTube - youtube.com/@channel or youtu.be links + elif "youtube" not in seen_types and ("youtube.com" in link_lower or "youtu.be" in link_lower): + domain = parsed_link.netloc.lower() + if "youtube.com" in domain or "youtu.be" in domain: + found["youtube"] = link + seen_types.add("youtube") + logger.debug(f" Found youtube: {link}") + # Mastodon - /@username pattern on known instances or any instance - # Exclude Twitter/X which don't use /@, but guard against edge cases + # Exclude Twitter/X and YouTube which also use /@username patterns elif "mastodon" not in seen_types and "/@" in link: domain = parsed_link.netloc.lower() # Skip Twitter/X domains (exact host or subdomains only) if domain == "twitter.com" or domain.endswith((".x.com", ".twitter.com")) or domain == "x.com": pass + # Skip YouTube domains + elif "youtube.com" in domain or "youtu.be" in domain: + pass elif domain in MASTODON_INSTANCES or "mastodon" in domain or "toot" in domain: found["mastodon"] = link seen_types.add("mastodon") diff --git a/utils/schema.yml b/utils/schema.yml index 9ad7e5b7827..94438152025 100644 --- a/utils/schema.yml +++ b/utils/schema.yml @@ -18,6 +18,7 @@ twitter: BestConfEver # Twitter handle of conference (Optional) mastodon: https://mastodon.social/@bconf # Mastodon handle of conference (Optional) bluesky: https://bsky.app/@bconf # Bluesky handle of conference (Optional) + youtube: https://www.youtube.com/@bconf # YouTube channel of conference (Optional) sub: PY # Type of conference (see or add _data/types.yml) note: Important # 
In case there are extra notes about the conference (Optional) location: # Geolocation for inclusion in map diff --git a/utils/tidy_conf/schema.py b/utils/tidy_conf/schema.py index 11885c87365..ea8e391b78a 100644 --- a/utils/tidy_conf/schema.py +++ b/utils/tidy_conf/schema.py @@ -72,6 +72,7 @@ class Conference(BaseModel): twitter: str | None = None mastodon: HttpUrl | None = None bluesky: str | None = None + youtube: HttpUrl | None = None sub: str note: str | None = None location: list[Location] | None = None @@ -121,7 +122,7 @@ def validate_title(cls, v): return re.sub(r"\b(19|20)\d{2}\b", "", v).strip() return v - @field_serializer("link", "cfp_link", "sponsor", "finaid", "mastodon") + @field_serializer("link", "cfp_link", "sponsor", "finaid", "mastodon", "youtube") def ser_url(self, value): return str(value) diff --git a/utils/tidy_conf/validation.py b/utils/tidy_conf/validation.py index 84cab7a1afa..46a9dd1ee11 100644 --- a/utils/tidy_conf/validation.py +++ b/utils/tidy_conf/validation.py @@ -33,6 +33,7 @@ "twitter", "mastodon", "bluesky", + "youtube", "location", "extra_places", ] From 2c42cd271b148163c86824535fa48c83c8a1d5a5 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 18 Mar 2026 19:27:15 +0000 Subject: [PATCH 2/6] feat: add Bluesky and YouTube display to conference templates Bluesky and YouTube data was being tracked but never shown to users. Add display links to conference detail pages, summary pages, and the index listing row using Font Awesome brand icons. 
https://claude.ai/code/session_0154a8RdG7M2nj83zPWVodgZ --- _includes/index_conf_title_row.html | 3 +++ _layouts/conference.html | 6 ++++++ _layouts/summary.html | 12 ++++++++++++ 3 files changed, 21 insertions(+) diff --git a/_includes/index_conf_title_row.html b/_includes/index_conf_title_row.html index 765dd9b7da5..0400e4b1d99 100644 --- a/_includes/index_conf_title_row.html +++ b/_includes/index_conf_title_row.html @@ -23,6 +23,9 @@ {% elsif conf.twitter %} Twitter {% endif %} + {% if conf.bluesky %} + + {% endif %}
diff --git a/_layouts/conference.html b/_layouts/conference.html index 8b99898e43f..50959ba31df 100644 --- a/_layouts/conference.html +++ b/_layouts/conference.html @@ -162,6 +162,12 @@

a.k.a. {{page.alt_name}} {{page.year}}

Mastodon {% endif %} + {% if page.bluesky %} +
+ + Bluesky +
+ {% endif %} {% if page.youtube %}
diff --git a/_layouts/summary.html b/_layouts/summary.html index 80dac2620fd..9f796ca5568 100644 --- a/_layouts/summary.html +++ b/_layouts/summary.html @@ -71,6 +71,18 @@

Mastodon

{% endif %} + {% if confs[0].bluesky %} +
+ + Bluesky +
+ {% endif %} + {% if confs[0].youtube %} +
+ + YouTube +
+ {% endif %}
From 9feec7e1f4f3eea05477e23117276947044a89c6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 18 Apr 2026 17:57:55 +0000 Subject: [PATCH 3/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- utils/enrich_tba.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/utils/enrich_tba.py b/utils/enrich_tba.py index f84dc0defb2..97348db5676 100644 --- a/utils/enrich_tba.py +++ b/utils/enrich_tba.py @@ -355,10 +355,7 @@ def extract_links_from_url(url: str) -> dict[str, str]: domain = parsed_link.netloc.lower() # Skip Twitter/X domains (exact host or subdomains only) - if domain == "twitter.com" or domain.endswith((".x.com", ".twitter.com")) or domain == "x.com": - pass - # Skip YouTube domains - elif "youtube.com" in domain or "youtu.be" in domain: + if domain == "twitter.com" or domain.endswith((".x.com", ".twitter.com")) or domain == "x.com" or ("youtube.com" in domain or "youtu.be" in domain): pass elif domain in MASTODON_INSTANCES or "mastodon" in domain or "toot" in domain: found["mastodon"] = link From 162db0f176514f536390e94985ae9704069e62d3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 18 Apr 2026 18:00:05 +0000 Subject: [PATCH 4/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- utils/enrich_tba.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/utils/enrich_tba.py b/utils/enrich_tba.py index 076b4d001c7..de30995a346 100644 --- a/utils/enrich_tba.py +++ b/utils/enrich_tba.py @@ -355,7 +355,12 @@ def extract_links_from_url(url: str) -> dict[str, str]: domain = parsed_link.netloc.lower() # Skip Twitter/X domains (exact host or subdomains only) - if domain == "twitter.com" or domain.endswith((".x.com", ".twitter.com")) or domain == "x.com" or ("youtube.com" in domain or 
"youtu.be" in domain): + if ( + domain == "twitter.com" + or domain.endswith((".x.com", ".twitter.com")) + or domain == "x.com" + or ("youtube.com" in domain or "youtu.be" in domain) + ): pass elif domain in MASTODON_INSTANCES or "mastodon" in domain or "toot" in domain: found["mastodon"] = link From 6c9329155995cf213093a42cecd9349574ea2241 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 18 Apr 2026 18:01:42 +0000 Subject: [PATCH 5/6] fix(enrich-tba): use exact domain matching for YouTube/Twitter detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeQL flagged the substring checks ("youtube.com" in domain) as incomplete URL sanitization — a host like evil-youtube.com.attacker could match. Replace with a _domain_matches helper that accepts an exact host or a proper subdomain, and reuse it for Twitter/X. Also collapses the combined Twitter/YouTube skip condition, which Ruff flagged with E501 (line too long), into a readable form. https://claude.ai/code/session_0154a8RdG7M2nj83zPWVodgZ --- utils/enrich_tba.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/utils/enrich_tba.py b/utils/enrich_tba.py index de30995a346..bed4ec4bcfe 100644 --- a/utils/enrich_tba.py +++ b/utils/enrich_tba.py @@ -267,6 +267,11 @@ def get_all_links(url: str) -> list[str]: return [] +def _domain_matches(domain: str, hosts: tuple[str, ...]) -> bool: + """Return True if domain equals one of hosts or is a subdomain of one.""" + return any(domain == h or domain.endswith(f".{h}") for h in hosts) + + # Known Mastodon instances (common ones in tech/Python community) MASTODON_INSTANCES = { "mastodon.social", @@ -334,6 +339,10 @@ def extract_links_from_url(url: str) -> dict[str, str]: for link in links: link_lower = link.lower() parsed_link = urlparse(link) + link_domain = parsed_link.netloc.lower() + + is_youtube = _domain_matches(link_domain, ("youtube.com", "youtu.be")) + is_twitter = _domain_matches(link_domain,
("twitter.com", "x.com")) # Bluesky - always bsky.app/profile/ if "bluesky" not in seen_types and "bsky.app/profile/" in link_lower: @@ -342,27 +351,18 @@ def extract_links_from_url(url: str) -> dict[str, str]: logger.debug(f" Found bluesky: {link}") # YouTube - youtube.com/@channel or youtu.be links - elif "youtube" not in seen_types and ("youtube.com" in link_lower or "youtu.be" in link_lower): - domain = parsed_link.netloc.lower() - if "youtube.com" in domain or "youtu.be" in domain: - found["youtube"] = link - seen_types.add("youtube") - logger.debug(f" Found youtube: {link}") + elif "youtube" not in seen_types and is_youtube: + found["youtube"] = link + seen_types.add("youtube") + logger.debug(f" Found youtube: {link}") # Mastodon - /@username pattern on known instances or any instance # Exclude Twitter/X and YouTube which also use /@username patterns elif "mastodon" not in seen_types and "/@" in link: - domain = parsed_link.netloc.lower() - - # Skip Twitter/X domains (exact host or subdomains only) - if ( - domain == "twitter.com" - or domain.endswith((".x.com", ".twitter.com")) - or domain == "x.com" - or ("youtube.com" in domain or "youtu.be" in domain) - ): + # Skip Twitter/X and YouTube domains + if is_twitter or is_youtube: pass - elif domain in MASTODON_INSTANCES or "mastodon" in domain or "toot" in domain: + elif link_domain in MASTODON_INSTANCES or "mastodon" in link_domain or "toot" in link_domain: found["mastodon"] = link seen_types.add("mastodon") logger.debug(f" Found mastodon: {link}") From 6d949a78a484e2a09f8e72903080719be2b0e690 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 18 Apr 2026 18:08:04 +0000 Subject: [PATCH 6/6] chore: bump pyupgrade to v3.21.2 for Python 3.14 compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pyupgrade v3.15.2 crashes on Python 3.14 with a TypeError from tokenize.cookie_re — it passes a str where newer CPython expects a bytes pattern. 
pre-commit.ci runs on Python 3.14, so the hook was failing on every PR regardless of the diff. Bumping to v3.21.2 picks up the upstream fix. https://claude.ai/code/session_0154a8RdG7M2nj83zPWVodgZ --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f94db7a3529..0f0fa0fb603 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -27,7 +27,7 @@ repos: - --force-single-line-imports - --profile black - repo: https://github.com/asottile/pyupgrade # Upgrade Python syntax - rev: v3.15.2 + rev: v3.21.2 hooks: - id: pyupgrade args: