feat: 新增抖音视频/图片解析 skill

2026-05-11 14:32:26 +08:00 · 2026-05-11 14:32:26 +08:00 · 5dbae039d4
commit 5dbae039d4
parent 4c4a3c5c95
2 changed files with 398 additions and 0 deletions
--- a/skills/douyin-video-parse/SKILL.md
+++ b/skills/douyin-video-parse/SKILL.md
@ -0,0 +1,53 @@
+---
+name: douyin-video-parse
+description: "当用户发送包含抖音短链接（https://v.douyin.com/xxx）的消息时触发。自动解析抖音视频/图片，并发送给当前用户。"
+argument-hint: "消息中包含抖音短链接即可自动触发"
+---
+
+# Douyin Video Parse Skill
+
+## 描述
+
+这是一个用于解析抖音短视频/图片的技能。
+
+当用户发送的消息中包含 `https://v.douyin.com/` 链接时，自动解析该链接对应的视频或图片，并通过本地微信机器人接口发送给当前用户。
+
+这个仓库里额外提供了一个可执行脚本 `scripts/douyin_video_parse.py`，方便宿主机器人直接调用。
+
+## 触发条件
+
+- 用户消息中包含 `https://v.douyin.com/` 链接
+
+## 解析原理
+
+1. 访问抖音短链接，跟随 302 重定向获取真实页面 URL
+2. 请求真实页面 HTML，从中提取 `window._ROUTER_DATA` JSON 数据
+3. 从 JSON 中解析出视频播放地址或图片列表
+4. 通过本地微信机器人接口发送视频或图片
+
+## 环境变量
+
+- `ROBOT_WECHAT_CLIENT_PORT`：本地微信机器人服务端口。
+- `ROBOT_FROM_WX_ID`：当前消息来源用户的 wxid。
+- `ROBOT_MESSAGE_CONTENT`：用户发送的原始消息内容（用于提取抖音链接）。
+
+## 执行步骤
+
+1. 当用户消息中包含 `https://v.douyin.com/` 链接时触发该技能。
+2. 在仓库根目录下执行本地脚本：`python3 scripts/douyin_video_parse.py`。
+3. 脚本从环境变量 `ROBOT_MESSAGE_CONTENT` 中提取抖音短链接。
+4. 脚本访问短链接，跟随重定向获取真实页面 URL。
+5. 脚本请求真实页面，解析 `window._ROUTER_DATA` 中的视频/图片信息。
+6. 如果是视频：
+   - 先发送文字提示（作者、标题）
+   - 再调用 `POST http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/message/send/video/url` 发送视频
+7. 如果是图片：
+   - 发送文字提示（作者、标题、图片数量）
+   - 调用 `POST http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/message/send/image/url` 逐张发送图片
+8. 如果解析失败，回复兜底文案：`抖音解析失败，可能是链接已失效或格式不正确。`
+
+## 回复要求
+
+- 视频类型：发送视频文件，附带作者和标题信息。
+- 图片类型：发送所有图片，附带作者和标题信息。
+- 失败时，使用固定兜底文案回复。
--- a/skills/douyin-video-parse/scripts/douyin_video_parse.py
+++ b/skills/douyin-video-parse/scripts/douyin_video_parse.py
@ -0,0 +1,345 @@
+#!/usr/bin/env python3
+
+from __future__ import annotations
+
+import html
+import json
+import os
+import re
+import sys
+import traceback
+import urllib.error
+import urllib.parse
+import urllib.request
+
+
+# Redirect stderr to stdout so everything the script writes ends up on one stream.
+sys.stderr = sys.stdout
+
+
+# iPhone Safari User-Agent attached by build_request().
+DOUYIN_USER_AGENT = (
+    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) "
+    "AppleWebKit/605.1.15 (KHTML, like Gecko) "
+    "Version/14.0 Mobile/15E148 Safari/604.1"
+)
+# Referer header attached by build_request().
+DOUYIN_REFERER = "https://www.douyin.com/"
+# Fixed fallback message written to stdout when input is missing or parsing fails.
+FALLBACK_TEXT = "抖音解析失败，可能是链接已失效或格式不正确。"
+# Captures the JSON object assigned to window._ROUTER_DATA, up to the closing </script>.
+ROUTER_DATA_RE = re.compile(r"(?s)window\._ROUTER_DATA\s*=\s*(\{.*?\})\s*</script>")
+# Matches any https:// URL up to the next whitespace; main() filters the matches further.
+DOUYIN_URL_RE = re.compile(r"https://[^\s]+")
+
+
+def build_request(url: str) -> urllib.request.Request:
+    return urllib.request.Request(
+        url,
+        headers={
+            "User-Agent": DOUYIN_USER_AGENT,
+            "Referer": DOUYIN_REFERER,
+        },
+    )
+
+
+def resolve_redirect(short_url: str) -> str | None:
+    """Follow the 302 redirect to get the real page URL."""
+
+    class NoRedirectHandler(urllib.request.HTTPRedirectHandler):
+        def redirect_request(self, req, fp, code, msg, headers, newurl):
+            return None
+
+    opener = urllib.request.build_opener(NoRedirectHandler)
+    req = build_request(short_url)
+    try:
+        response = opener.open(req, timeout=15)
+        return response.url
+    except urllib.error.HTTPError as e:
+        location = e.headers.get("Location")
+        if location:
+            return location
+        return None
+    except (urllib.error.URLError, TimeoutError):
+        return None
+
+
+def fetch_page_html(page_url: str) -> str | None:
+    """Fetch the Douyin page HTML content."""
+    req = build_request(page_url)
+    try:
+        with urllib.request.urlopen(req, timeout=15) as response:
+            if response.status != 200:
+                return None
+            return response.read().decode("utf-8", errors="replace")
+    except (urllib.error.URLError, TimeoutError):
+        return None
+
+
+def decode_escaped_value(value: str) -> str:
+    """Decode HTML entities and JSON escape sequences."""
+    decoded = html.unescape(value)
+    if "\\" in decoded:
+        try:
+            unquoted = json.loads('"' + decoded.replace('"', '\\"') + '"')
+            decoded = unquoted
+        except (json.JSONDecodeError, ValueError):
+            pass
+    return html.unescape(decoded)
+
+
+def pick_preferred_url(urls: list[str]) -> str:
+    """Pick the best URL from a list, preferring p26 CDN."""
+    first_url = ""
+    for raw_url in urls:
+        if not raw_url:
+            continue
+        decoded_url = decode_escaped_value(raw_url)
+        if not decoded_url:
+            continue
+        if decoded_url.startswith("https://p26"):
+            return decoded_url
+        if not first_url:
+            first_url = decoded_url
+    return first_url
+
+
+def pick_video_url(urls: list[str]) -> str:
+    """Pick the best video URL, preferring aweme.snssdk.com."""
+    decoded_urls = []
+    for raw_url in urls:
+        if not raw_url:
+            continue
+        decoded_url = decode_escaped_value(raw_url).replace("playwm", "play")
+        decoded_urls.append(decoded_url)
+
+    for url in decoded_urls:
+        if "aweme.snssdk.com" in url:
+            return url
+    return decoded_urls[0] if decoded_urls else ""
+
+
+def extract_aweme_item(html_content: str) -> dict | None:
+    """Extract the first aweme item from _ROUTER_DATA."""
+    match = ROUTER_DATA_RE.search(html_content)
+    if not match:
+        return None
+
+    try:
+        router_data = json.loads(match.group(1))
+    except json.JSONDecodeError:
+        return None
+
+    loader_data = router_data.get("loaderData", {})
+    for page_data in loader_data.values():
+        if not isinstance(page_data, dict):
+            continue
+        video_info_res = page_data.get("videoInfoRes", {})
+        item_list = video_info_res.get("item_list", [])
+        if item_list:
+            return item_list[0]
+    return None
+
+
+def parse_note_item(item: dict) -> dict | None:
+    """Parse image/note type content."""
+    images = item.get("images") or item.get("image_infos") or []
+    if not images:
+        return None
+
+    image_urls = []
+    seen = set()
+    for img_info in images:
+        url_list = img_info.get("url_list", [])
+        for url in url_list:
+            if url and url.startswith("http"):
+                decoded = html.unescape(url)
+                if decoded not in seen:
+                    image_urls.append(decoded)
+                    seen.add(decoded)
+                    break
+
+    if not image_urls:
+        return None
+
+    author = item.get("author", {})
+    music = item.get("music", {})
+    music_url = pick_preferred_url(music.get("play_url", {}).get("url_list", []))
+
+    # Fallback music URL from video play_addr
+    if not music_url:
+        video = item.get("video", {})
+        play_addr = video.get("play_addr", {})
+        uri = play_addr.get("uri", "")
+        if uri.startswith("http"):
+            music_url = decode_escaped_value(uri)
+        else:
+            music_url = pick_preferred_url(play_addr.get("url_list", []))
+
+    return {
+        "type": "note",
+        "author": html.unescape(author.get("nickname", "")),
+        "title": html.unescape(item.get("desc", "")),
+        "images": image_urls,
+        "music_url": music_url,
+    }
+
+
+def parse_video_item(item: dict) -> dict | None:
+    """Parse video type content."""
+    video = item.get("video", {})
+    duration = video.get("duration")
+    if duration is not None and duration == 0:
+        return None
+
+    play_addr = video.get("play_addr", {})
+    video_url = pick_video_url(play_addr.get("url_list", []))
+    if not video_url:
+        return None
+
+    author = item.get("author", {})
+    return {
+        "type": "video",
+        "author": html.unescape(author.get("nickname", "")),
+        "title": html.unescape(item.get("desc", "")),
+        "url": video_url,
+        "cover": pick_preferred_url(video.get("cover", {}).get("url_list", [])),
+    }
+
+
+def parse_douyin(short_url: str) -> dict | None:
+    """Main parsing logic: resolve redirect -> fetch HTML -> extract data."""
+    resolved_url = resolve_redirect(short_url)
+    if not resolved_url:
+        return None
+
+    html_content = fetch_page_html(resolved_url)
+    if not html_content:
+        return None
+
+    item = extract_aweme_item(html_content)
+    if not item:
+        return None
+
+    # Try note (images) first, then video
+    result = parse_note_item(item)
+    if result:
+        return result
+
+    result = parse_video_item(item)
+    if result:
+        return result
+
+    return None
+
+
+def send_video(video_url: str, robot_port: str, to_wxid: str) -> bool:
+    """Send video via local robot API."""
+    api_url = f"http://127.0.0.1:{robot_port}/api/v1/robot/message/send/video/url"
+    body = json.dumps({
+        "to_wxid": to_wxid,
+        "video_urls": [video_url],
+    }).encode("utf-8")
+    request = urllib.request.Request(
+        api_url,
+        data=body,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(request, timeout=60) as response:
+            return 200 <= response.status < 300
+    except (urllib.error.URLError, TimeoutError):
+        return False
+
+
+def send_images(image_urls: list[str], robot_port: str, to_wxid: str) -> bool:
+    """Send images via local robot API."""
+    api_url = f"http://127.0.0.1:{robot_port}/api/v1/robot/message/send/image/url"
+    body = json.dumps({
+        "to_wxid": to_wxid,
+        "image_urls": image_urls,
+    }).encode("utf-8")
+    request = urllib.request.Request(
+        api_url,
+        data=body,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(request, timeout=60) as response:
+            return 200 <= response.status < 300
+    except (urllib.error.URLError, TimeoutError):
+        return False
+
+
+def send_text(text: str, robot_port: str, to_wxid: str) -> bool:
+    """Send text message via local robot API."""
+    api_url = f"http://127.0.0.1:{robot_port}/api/v1/robot/message/send/text"
+    body = json.dumps({
+        "to_wxid": to_wxid,
+        "content": text,
+    }).encode("utf-8")
+    request = urllib.request.Request(
+        api_url,
+        data=body,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(request, timeout=10) as response:
+            return 200 <= response.status < 300
+    except (urllib.error.URLError, TimeoutError):
+        return False
+
+
+def main() -> int:
+    robot_port = os.environ.get("ROBOT_WECHAT_CLIENT_PORT", "").strip()
+    to_wxid = os.environ.get("ROBOT_FROM_WX_ID", "").strip()
+    message_content = os.environ.get("ROBOT_MESSAGE_CONTENT", "").strip()
+
+    if not robot_port or not to_wxid or not message_content:
+        sys.stdout.write(FALLBACK_TEXT + "\n")
+        return 0
+
+    # Extract douyin URL from message
+    matches = DOUYIN_URL_RE.findall(message_content)
+    douyin_urls = [u for u in matches if "v.douyin.com" in u]
+    if not douyin_urls:
+        sys.stdout.write(FALLBACK_TEXT + "\n")
+        return 0
+
+    douyin_url = douyin_urls[0]
+    result = parse_douyin(douyin_url)
+    if not result:
+        sys.stdout.write(FALLBACK_TEXT + "\n")
+        return 0
+
+    if result["type"] == "video":
+        # Send info text
+        info_text = f"抖音视频解析成功\n作者: {result['author']}\n标题: {result['title']}"
+        send_text(info_text, robot_port, to_wxid)
+        # Send video
+        if not send_video(result["url"], robot_port, to_wxid):
+            sys.stdout.write("发送抖音视频失败，请稍后重试。\n")
+            return 0
+
+    elif result["type"] == "note":
+        # Send info text
+        info_text = (
+            f"抖音图片解析成功\n"
+            f"作者: {result['author']}\n"
+            f"标题: {result['title']}\n\n"
+            f"{len(result['images'])}张图片正在发送中..."
+        )
+        send_text(info_text, robot_port, to_wxid)
+        # Send images
+        if not send_images(result["images"], robot_port, to_wxid):
+            sys.stdout.write("发送抖音图片失败，请稍后重试。\n")
+            return 0
+
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())
+    except SystemExit:
+        raise
+    except Exception:
+        traceback.print_exc(file=sys.stdout)
+        raise SystemExit(1)