I had code that decoded Google News RSS article URLs, but it no longer works; it looks like Google has changed the encoding of the URL.
Example URL:
“https://news.google.com/rss/articles/CBMiuwFBVV95cUxPNXpRbXdHR3NzWHRlbk40d1I5OFVIajRHUHBXTUFGc3BMd1gxSEU1ZDlocGFWQXZhWEJYakROLUxQcTBZMElHN1VTdlN2eTV2LWFidnhIOHZEVEYwLVhEalpxRFRXeGhXQlZoNEc4d1AzTWR3YUlUZVAybjZWa2c4MU9kLWU2aEtmNlVnRy1OR3ZLcGd1M0NqVjFxeFRaOE9fWExpa1ZxSFpySnRkallHN3dFMm5nU1BIY18w0gHAAUFVX3lxTE5fZDdFSTQwZGVzb3A1eUdIbzNIa0F0RmZlYUFmR3lPVnRZU09QU2hnelFpNngxVXI1aGlYdWE1dzROcTRXSmw3a1dFZ0c1MDNROUU3enYzSFBPaEdpaHZUR0t1V2lpLWt5UEVEY01TbXRvM243U2p4ZTA3MlBtaU9XYmNqMU11QWdUVkFQRmk5RXJRb2Jwa3p0NUptUlpZanpNZVQxaVk2NkJVdU9kRmxlakVPOEx6ZFlIcG9oaWJMaA?oc=5”
Code that was working previously:
# Common prefix of every encoded Google News RSS article link handled here.
_ENCODED_URL_PREFIX = "https://news.google.com/rss/articles/"
# Captures the base64 token that sits between the prefix and the query string.
_ENCODED_URL_RE = re.compile(fr"^{re.escape(_ENCODED_URL_PREFIX)}(?P<encoded_url>[^?]+)")
# Matches the decoded binary payload: the header bytes 0x08 0x13 '"', one or
# more bytes that are skipped (presumably a length prefix), the embedded URL,
# and the 0xD2 0x01 marker that follows it.
# The byte escapes must keep their backslashes; without them the pattern
# matches the literal text "x08x13" and never a real payload. DOTALL lets the
# skipped bytes include 0x0A, which "." would otherwise refuse to match.
_DECODED_URL_RE = re.compile(rb'^\x08\x13".+?(?P<primary_url>http[^\xd2]+)\xd2\x01', re.DOTALL)
class _GoogleNewsURLDecodeError(ValueError, AttributeError):
    """Raised when a Google News article URL cannot be decoded.

    It also derives from AttributeError because failed matches used to surface
    as ``AttributeError`` (attribute access on ``None``), so handlers written
    for that exception keep working.
    """


# NOTE: the cache key includes ``self`` as well as ``url``, and the cache keeps
# a reference to every ``self`` it has seen until the entry is evicted.
@functools.lru_cache(2048)
def _decode_google_news_url(self, url: str) -> str:
    """Return the publisher URL embedded in an encoded Google News RSS link.

    The token after the ``/rss/articles/`` prefix is URL-safe base64. Once
    decoded, the payload is searched for an embedded ``http`` URL.

    Args:
        url: A link starting with the Google News RSS article prefix.

    Returns:
        The embedded URL as text.

    Raises:
        _GoogleNewsURLDecodeError: If ``url`` lacks the expected prefix, or
            the decoded payload contains no embedded URL.
        binascii.Error: If the token is not valid base64.
        UnicodeDecodeError: If the embedded URL is not valid UTF-8.
    """
    encoded_match = _ENCODED_URL_RE.match(url)
    if encoded_match is None:
        raise _GoogleNewsURLDecodeError(f"Not an encoded Google News article URL: {url!r}")

    # The token is stored without its "=" padding. Appending the maximum
    # amount is safe because the decoder ignores any excess padding.
    encoded_text = encoded_match.group("encoded_url") + "==="
    payload = base64.urlsafe_b64decode(encoded_text)

    decoded_match = _DECODED_URL_RE.match(payload)
    if decoded_match is None:
        # Newer links carry a token (for example one starting with "AU_yqL")
        # instead of the URL itself, so there is nothing to extract here.
        raise _GoogleNewsURLDecodeError(
            f"No embedded publisher URL found in the decoded payload of {url!r}; "
            "newer Google News article IDs do not appear to embed one and "
            "cannot be decoded offline by this function."
        )

    return decoded_match.group("primary_url").decode()