Fix ID extraction crash when regex groups are optional (#2572)

* Fix ID extraction crash when regex groups are optional

  Handle None capture groups in username/id extraction and add regression coverage for optional trailing groups.

* Remove leftover line that overwrote safe _id in extract_id_from_url
2026-05-06 14:08:59 +00:00 · 2026-04-30 19:14:40 -03:00
parent 533884bad5
commit 9dbefcef11
2 changed files with 37 additions and 3 deletions
@@ -181,7 +181,15 @@ class MaigretSite:
        if self.url_regexp:
            match_groups = self.url_regexp.match(url)
            if match_groups:
-                return match_groups.groups()[-1].rstrip("/")
+                username = next(
+                    (
+                        group.rstrip("/")
+                        for group in reversed(match_groups.groups())
+                        if isinstance(group, str) and group
+                    ),
+                    None,
+                )
+                return username

        return None

@@ -196,8 +204,16 @@ class MaigretSite:
        match_groups = self.url_regexp.match(url)
        if not match_groups:
            return None
-
-        _id = match_groups.groups()[-1].rstrip("/")
+        _id = next(
+            (
+                group.rstrip("/")
+                for group in reversed(match_groups.groups())
+                if isinstance(group, str) and group
+            ),
+            None,
+        )
+        if _id is None:
+            return None
        _type = self.type

        return _id, _type
@@ -1,5 +1,7 @@
 """Maigret Database test functions"""

+import re
+
 from typing import Any, Dict

 from maigret.sites import MaigretDatabase, MaigretSite
@@ -126,6 +128,22 @@ def test_site_url_detector():
    )


+def test_extract_id_from_url_skips_none_groups():
+    site = MaigretSite(
+        "Example",
+        {
+            "urlMain": "https://example.com",
+            "url": "https://example.com/{username}",
+        },
+    )
+    site.url_regexp = re.compile(r"^https://example\.com/([^/?#]+)(?:/(.*))?$")
+
+    assert site.extract_id_from_url("https://example.com/username") == (
+        "username",
+        "username",
+    )
+
+
 def test_ranked_sites_dict():
    db = MaigretDatabase()
    db.update_site(MaigretSite('3', {'alexaRank': 1000, 'engine': 'ucoz'}))