feat: weserv via relay

Fixes #369 Signed-off-by: Rongrong <[email protected]>
Rongronggg9 · Dec 2, 2023 · 95b8e17 · 95b8e17
1 parent 1b9e3e4
commit 95b8e17
Show file tree

Hide file tree

Showing 3 changed files with 60 additions and 12 deletions.
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## Unreleased
 
+### Enhancements
+
+- **`wsrv.nl` via relay**: Try to use `wsrv.nl` (environment variable `IMAGES_WESERV_NL`) via the media relay server (environment variable `IMG_RELAY_SERVER`). This is a workaround for images from domains/TLDs banned by `wsrv.nl` or CDNs that ban `wsrv.nl`. It can hopefully reduce the frequency of seeing "invalid media" in messages since RSStT uses `wsrv.nl` heavily to convert images into formats accepted by Telegram DCs. See also [#369](https://github.com/Rongronggg9/RSS-to-Telegram-Bot/issues/369).
+
 ### Bug fixes
 
 - **"Remote" `/test` unavailable**: Fix a bug preventing the bot manager from using the `/test` command "remotely".

diff --git a/docs/CHANGELOG.zh.md b/docs/CHANGELOG.zh.md
@@ -2,6 +2,10 @@
 
 ## 尚未发布
 
+### 增强
+
+- **经反代的 `wsrv.nl`**: 尝试通过媒体反代服务器 (环境变量 `IMG_RELAY_SERVER`) 使用 `wsrv.nl` (环境变量 `IMAGES_WESERV_NL`)。这是对那些来自被 `wsrv.nl` 封禁的域名或将 `wsrv.nl` 封禁的 CDN 的图片的变通解决方案。考虑到 RSStT 大量使用 `wsrv.nl` 将图片转换为 Telegram DC 所接受的格式，这有望减少在消息中见到 "Invalid media" 的频率。另请参阅 [#369](https://github.com/Rongronggg9/RSS-to-Telegram-Bot/issues/369)。
+
 ### Bug 修复
 
 - **“远程” `/test` 不可用**：修复阻止 bot 管理员“远程”使用 `/test` 命令的错误。

diff --git a/src/parsing/medium.py b/src/parsing/medium.py
@@ -323,13 +323,20 @@ def flushed_log():
                     return True
                 medium_info = await web.get_medium_info(url)
                 if medium_info is None:
-                    if url.startswith(env.IMAGES_WESERV_NL) or url.startswith(env.IMG_RELAY_SERVER):
-                        invalid_reasons.append('fetch failed')
-                        continue
-                    medium_info = await web.get_medium_info(env.IMG_RELAY_SERVER + url)
-                    if medium_info is None:
-                        invalid_reasons.append('both original and relayed image fetch failed')
+                    if url.startswith(env.IMG_RELAY_SERVER):
+                        invalid_reasons.append('relayed image fetch failed')
                         continue
+                    elif url.startswith(env.IMAGES_WESERV_NL):
+                        url = insert_image_relay_into_weserv_url(url)
+                        medium_info = url and await web.get_medium_info(url)
+                        if medium_info is None:
+                            invalid_reasons.append('weserv fetch failed')
+                            continue
+                    else:
+                        medium_info = await web.get_medium_info(env.IMG_RELAY_SERVER + url)
+                        if medium_info is None:
+                            invalid_reasons.append('both original and relayed image fetch failed')
+                            continue
                 self.size, self.width, self.height, self.content_type = medium_info
                 if self.type == IMAGE and self.size <= self.maxSize and min(self.width, self.height) == -1 \
                         and self.content_type and self.content_type.startswith('image') \
@@ -461,14 +468,19 @@ async def change_server(self) -> bool:
             return False
         self._server_change_count += 1
         self.chosen_url = env.IMG_RELAY_SERVER + self.chosen_url
-        if not env.TRAFFIC_SAVING:
-            # noinspection PyBroadException
-            try:
-                await web.get(url=self.chosen_url, semaphore=False, max_size=0)  # let the img relay sever cache the img
-            except Exception:
-                pass
+        await self._try_get_chosen_url()  # let the relay server cache it
         return True
 
+    async def _try_get_chosen_url(self) -> bool:
+        if env.TRAFFIC_SAVING:
+            return True
+        # noinspection PyBroadException
+        try:
+            await web.get(url=self.chosen_url, semaphore=False, max_size=0)
+            return True
+        except Exception:
+            return False
+
     def __bool__(self):
         if self.valid is None:
             raise RuntimeError('You must validate a medium before judging its validation')
@@ -555,6 +567,15 @@ def __init__(self, urls: Union[str, list[str]]):
         self.chosen_url = self.urls[0]
 
     async def change_server(self) -> bool:
+        if weserv_relayed := insert_image_relay_into_weserv_url(self.chosen_url):
+            # success if:
+            # 1. it is a weserv URL; and
+            # 2. it is called the first time.
+            # here we don't need to increase _server_change_count because the second call will just return None
+            self.chosen_url = weserv_relayed
+            await self._try_get_chosen_url()  # let the relay server and weserv cache the image
+            return True
+
         sinaimg_server_match = sinaimg_server_parser(self.chosen_url)
         if not sinaimg_server_match:  # is not a sinaimg img
             return await super().change_server()
@@ -989,6 +1010,25 @@ def construct_weserv_url_convert_to_jpg(url: str) -> str:
     return construct_weserv_url(url, output_format='jpg')
 
 
+HEAD_IMAGES_WESERV_NL_URL: Final = construct_weserv_url('')
+HEAD_IMAGES_WESERV_NL_URL_RELAYED: Final = construct_weserv_url(env.IMG_RELAY_SERVER)
+LEN_HEAD_IMAGES_WESERV_NL_URL: Final = len(HEAD_IMAGES_WESERV_NL_URL)
+
+
+def insert_image_relay_into_weserv_url(url: str) -> Optional[str]:
+    """
+    Ensure weserv fetches the image via the relay server.
+    Useful when:
+    1. The image is from a domain/TLD banned by weserv; or
+    2. The image is from a CDN that bans weserv.
+    """
+    if not url.startswith(HEAD_IMAGES_WESERV_NL_URL):
+        return None  # not a weserv url
+    if url.startswith(HEAD_IMAGES_WESERV_NL_URL_RELAYED):
+        return None  # already relayed
+    return HEAD_IMAGES_WESERV_NL_URL_RELAYED + url[LEN_HEAD_IMAGES_WESERV_NL_URL:]
+
+
 async def detect_image_dimension_via_weserv(url: str) -> tuple[int, int]:
     url = construct_weserv_url_convert_to_jpg(url)
     res = await web.get_medium_info(url)