fix: 修复抖音视频下载403,添加User-Agent和Referer请求头

This commit is contained in:
houhou 2026-05-11 14:51:58 +08:00
parent 5dbae039d4
commit 74cc06bc27
33 changed files with 3183 additions and 5266 deletions

2
.gitignore vendored
View File

@ -1,2 +0,0 @@
__pycache__
.venv

26
.vscode/launch.json vendored
View File

@ -1,26 +0,0 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "text-to-image",
"type": "debugpy",
"request": "launch",
"program": "skills/text-to-image/scripts/text_to_image.py",
"console": "integratedTerminal",
"justMyCode": true,
"args": [
"--prompt=马云在直播间卖红薯",
"--model=gpt-image-2"
],
"env": {
"ROBOT_WECHAT_CLIENT_PORT": "9001",
"ROBOT_FROM_WX_ID": "57004904192@chatroom",
"ROBOT_CODE": "houhouipad",
"MYSQL_HOST": "127.0.0.1",
"MYSQL_PORT": "3306",
"MYSQL_USER": "root",
"MYSQL_PASSWORD": "houhou"
}
}
]
}

118
README.md
View File

@ -1,118 +0,0 @@
# wechat-robot-skills
微信机器人 Skills
**系统自动注入的环境变量**
- ROBOT_WECHAT_CLIENT_PORT: 机器人客户端服务端口,可用于在 SKILL 脚本直接调用客户端接口 `http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/xxxxx`
- ROBOT_ID: 机器人实例 ID
- ROBOT_CODE: 机器人实例编码
- MYSQL_HOST: mysql 地址
- MYSQL_PORT: mysql 端口
- MYSQL_USER: mysql 账号
- MYSQL_PASSWORD: mysql 密码
- ROBOT_REDIS_DB: 机器人的 Redis DB
- ROBOT_WX_ID: 机器人的微信 ID
- ROBOT_FROM_WX_ID: 微信消息来源(群聊 ID 或者好友微信 ID)
- ROBOT_SENDER_WX_ID: 微信消息发送人的微信 ID
- ROBOT_MESSAGE_ID: 微信消息 ID
- ROBOT_REF_MESSAGE_ID: 如果是引用消息,则是引用的消息的 ID
**需要发送图片的时候可以在控制台输出如下内容**
```
<wechat-robot-image-url>图片URL1</wechat-robot-image-url>
<wechat-robot-image-url>图片URL2</wechat-robot-image-url>
<wechat-robot-image-url>图片URL3</wechat-robot-image-url>
<wechat-robot-image-url>图片URL4</wechat-robot-image-url>
```
**需要发送视频的时候可以在控制台输出如下内容**
```
<wechat-robot-video-url>视频URL1</wechat-robot-video-url>
<wechat-robot-video-url>视频URL2</wechat-robot-video-url>
```
**需要发语音的时候可以在控制台输出如下内容**
```
<wechat-robot-voice-url>语音URL1</wechat-robot-voice-url>
<wechat-robot-voice-url>语音URL2</wechat-robot-voice-url>
```
**发送图片的时候也可以调用 Agent 接口**
1. 发送远程图片地址
```
[POST] http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/message/send/image/url
请求体 Body:
{
"to_wxid": "{{ROBOT_FROM_WX_ID}}",
"image_urls": ["{{imageurl}}"]
}
```
2. 发送本地图片路径
```
[POST] http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/message/send/image/local
请求体 Body:
{
"to_wxid": "{{ROBOT_FROM_WX_ID}}",
"file_path": "{{file_path}}"
}
```
**发送视频的时候也可以调用 Agent 接口**
```
[POST] http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/message/send/video/url
请求体 Body:
{
"to_wxid": "{{ROBOT_FROM_WX_ID}}",
"video_urls": ["{{videourl}}"]
}
```
**发送语音的时候也可以调用 Agent 接口**
```
[POST] http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/message/send/voice
说明:
该接口用于上传语音文件并发送给指定微信用户或群聊。
请求方式为 multipart/form-data,支持 .amr、.mp3、.wav 格式,单个文件大小不能超过 50MB。
表单参数:
- to_wxid: 接收方微信 ID,必填
- voice: 语音文件,必填
请求体 Body:
{
"to_wxid": "{{ROBOT_FROM_WX_ID}}",
"voice": "@/path/to/voice.amr"
}
```

846
douyin_video_parse.go Normal file
View File

@ -0,0 +1,846 @@
package plugins
import (
"bytes"
"context"
"crypto/md5"
"encoding/hex"
"encoding/json"
"fmt"
"html"
"image"
"image/color"
"image/draw"
"image/jpeg"
_ "image/png"
"io"
"log"
"mime/multipart"
"net/http"
"net/url"
"path"
"regexp"
"strings"
"time"
"github.com/go-resty/resty/v2"
xdraw "golang.org/x/image/draw"
_ "golang.org/x/image/webp"
"wechat-robot-client/dto"
"wechat-robot-client/interface/plugin"
"wechat-robot-client/pkg/robot"
"wechat-robot-client/utils"
"wechat-robot-client/vars"
)
// VideoParseResponse is the envelope returned by parseDouyinVideo: a status
// code, an optional message, and the parsed media data.
type VideoParseResponse struct {
Code int `json:"code"`
Msg string `json:"msg"`
Data VideoParseData `json:"data"`
}
// VideoParseData is the normalized result of parsing one Douyin post.
// For a video post URL is set; for an image post Images is set instead.
type VideoParseData struct {
Author string `json:"author"`
Avatar string `json:"avatar"`
Title string `json:"title"`
Desc string `json:"desc"`
// NOTE(review): Digg, Comment, Play and CreateTime are not populated by any
// parser visible in this file — confirm whether other code fills them.
Digg int32 `json:"digg"`
Comment int32 `json:"comment"`
Play int32 `json:"play"`
CreateTime int64 `json:"create_time"`
Cover string `json:"cover"`
URL string `json:"url"`
Images []string `json:"images"`
MusicURL string `json:"music_url"`
}
// DouyinRouterData mirrors the JSON object assigned to window._ROUTER_DATA in
// the Douyin share page; only the loaderData member is decoded.
type DouyinRouterData struct {
LoaderData map[string]DouyinLoaderPageData `json:"loaderData"`
}
// DouyinLoaderPageData is one entry of loaderData; only videoInfoRes is decoded.
type DouyinLoaderPageData struct {
VideoInfoRes DouyinVideoInfoRes `json:"videoInfoRes"`
}
// DouyinVideoInfoRes wraps the list of post items found under videoInfoRes.
type DouyinVideoInfoRes struct {
ItemList []DouyinAwemeItem `json:"item_list"`
}
// DouyinAwemeItem is a single post as embedded in the page JSON. The parsers in
// this file read Desc, Author, Music, Video, Images and ImageInfos.
type DouyinAwemeItem struct {
Desc string `json:"desc"`
Author DouyinAuthor `json:"author"`
Music DouyinMusic `json:"music"`
Video DouyinVideo `json:"video"`
Images []DouyinImageInfo `json:"images"`
ImageInfos []DouyinImageInfo `json:"image_infos"`
ImgBitrate []DouyinImageGear `json:"img_bitrate"`
}
// DouyinAuthor holds the author fields decoded from a post item.
type DouyinAuthor struct {
Nickname string `json:"nickname"`
Signature string `json:"signature"`
AvatarThumb DouyinURLResource `json:"avatar_thumb"`
AvatarMedium DouyinURLResource `json:"avatar_medium"`
}
// DouyinMusic holds the background-music fields decoded from a post item.
type DouyinMusic struct {
Mid string `json:"mid"`
Title string `json:"title"`
Author string `json:"author"`
PlayURL DouyinURLResource `json:"play_url"`
CoverHD DouyinURLResource `json:"cover_hd"`
CoverLarge DouyinURLResource `json:"cover_large"`
CoverMedium DouyinURLResource `json:"cover_medium"`
CoverThumb DouyinURLResource `json:"cover_thumb"`
}
// DouyinVideo holds the video fields decoded from a post item.
type DouyinVideo struct {
// Duration is a pointer so that an absent field (nil) can be told apart
// from an explicit 0, which parseDouyinVideoItem treats as "not a video".
Duration *int64 `json:"duration"`
PlayAddr DouyinURLResource `json:"play_addr"`
Cover DouyinURLResource `json:"cover"`
}
// DouyinImageInfo describes one image of an image post with its candidate URLs.
type DouyinImageInfo struct {
URI string `json:"uri"`
URLList []string `json:"url_list"`
DownloadURLList []string `json:"download_url_list"`
}
// DouyinImageGear is a named group of images decoded from img_bitrate.
type DouyinImageGear struct {
Name string `json:"name"`
Images []DouyinImageInfo `json:"images"`
}
// DouyinURLResource is a resource identified by a URI plus a list of URLs.
type DouyinURLResource struct {
URI string `json:"uri"`
URLList []string `json:"url_list"`
}
// douyinUserAgent is the iPhone Safari User-Agent string sent with the HTTP
// requests this file makes to Douyin.
const douyinUserAgent = "Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"
var (
// douyinRouterDataRegexp captures the JSON object assigned to
// window._ROUTER_DATA, up to the closing </script> tag. (?s) lets "." span
// newlines.
douyinRouterDataRegexp = regexp.MustCompile(`(?s)window\._ROUTER_DATA\s*=\s*({.*?})\s*</script>`)
)
// DouyinVideoParsePlugin handles messages that contain a Douyin share link.
// The type carries no fields.
type DouyinVideoParsePlugin struct{}

// NewDouyinVideoParsePlugin returns a new plugin value typed as
// plugin.MessageHandler.
func NewDouyinVideoParsePlugin() plugin.MessageHandler {
	return new(DouyinVideoParsePlugin)
}

// GetName reports the fixed plugin name.
func (p *DouyinVideoParsePlugin) GetName() string {
	const name = "DouyinVideoParse"
	return name
}

// GetLabels reports the labels attached to this plugin.
func (p *DouyinVideoParsePlugin) GetLabels() []string {
	labels := make([]string, 0, 2)
	labels = append(labels, "text", "douyin")
	return labels
}

// PreAction decides whether the message may be processed. Messages outside a
// chat room always pass; chat-room messages must pass the common chat-room
// pre-check and have short-video parsing enabled in the settings.
func (p *DouyinVideoParsePlugin) PreAction(ctx *plugin.MessageContext) bool {
	if !ctx.Message.IsChatRoom {
		return true
	}
	if proceed := NewChatRoomCommonPlugin().PreAction(ctx); !proceed {
		return false
	}
	return ctx.Settings.IsShortVideoParsingEnabled()
}

// PostAction is intentionally a no-op.
func (p *DouyinVideoParsePlugin) PostAction(ctx *plugin.MessageContext) {}

// Match reports whether the message text contains a Douyin short-link prefix.
func (p *DouyinVideoParsePlugin) Match(ctx *plugin.MessageContext) bool {
	const marker = "https://v.douyin.com"
	return strings.Contains(ctx.Message.Content, marker)
}
var (
	// douyinShareURLRegexp matches a Douyin short share link up to the next
	// whitespace character. Compiled once instead of on every message.
	douyinShareURLRegexp = regexp.MustCompile(`https://v\.douyin\.com[^\s]*`)
	// douyinFallbackURLRegexp matches any https link; it is only consulted
	// when no Douyin short link is present, which keeps the previous behavior.
	douyinFallbackURLRegexp = regexp.MustCompile(`https://[^\s]+`)
)

// Run parses the Douyin link found in the message and replies with the result.
//
// For a video post it sends a share-link card followed by the video itself.
// For an image post it announces the image count, sends the background music
// in a separate goroutine, and sends the images merged vertically in batches
// of 20 (falling back to batches of 10 when a merged image is too large).
// Every failure is reported back to the sender as a text message.
func (p *DouyinVideoParsePlugin) Run(ctx *plugin.MessageContext) {
	if !p.PreAction(ctx) {
		return
	}

	// Prefer the Douyin link itself: the message may contain another https
	// link before it, and taking the first https token would parse the wrong
	// page.
	content := ctx.Message.Content
	douyinURL := douyinShareURLRegexp.FindString(content)
	if douyinURL == "" {
		douyinURL = douyinFallbackURLRegexp.FindString(content)
	}
	if douyinURL == "" {
		ctx.MessageService.SendTextMessage(ctx.Message.FromWxID, "未找到抖音链接")
		return
	}

	respData, err := parseDouyinVideo(douyinURL)
	if err != nil {
		ctx.MessageService.SendTextMessage(ctx.Message.FromWxID, fmt.Sprintf("解析失败: %v", err))
		return
	}

	if respData.Data.URL != "" {
		shareLink := robot.ShareLinkMessage{
			Title:    fmt.Sprintf("抖音视频解析成功 - %s", respData.Data.Author),
			Des:      respData.Data.Title,
			Url:      respData.Data.URL,
			ThumbUrl: robot.CDATAString("https://mmbiz.qpic.cn/mmbiz_png/NbW0ZIUM8lVHoUbjXw2YbYXbNJDtUH7Sbkibm9Qwo9FhAiaEFG4jY3Q2MEleRpiaWDyDv8BZUfR85AW3kG4ib6DyAw/640?wx_fmt=png"),
		}
		if respData.Data.Desc != "" {
			shareLink.Des = respData.Data.Desc
		}
		// The share card is best-effort; the video message below is what matters.
		_ = ctx.MessageService.ShareLink(ctx.Message.FromWxID, shareLink)
		err = ctx.MessageService.SendVideoMessageByRemoteURL(ctx.Message.FromWxID, respData.Data.URL)
		if err != nil {
			ctx.MessageService.SendTextMessage(ctx.Message.FromWxID, fmt.Sprintf("发送抖音视频失败: %v", err.Error()))
		}
		return
	}

	if len(respData.Data.Images) > 0 {
		ctx.MessageService.SendTextMessage(ctx.Message.FromWxID, fmt.Sprintf("抖音图片解析成功\n作者: %s\n标题: %s\n\n%d张图片正在发送中...", respData.Data.Author, respData.Data.Title, len(respData.Data.Images)))

		if respData.Data.MusicURL != "" {
			// Send the music concurrently so it does not delay the images.
			go func(musicURL, author string) {
				var err error
				if isAudioURL(musicURL) {
					err = sendMusicMessageByURL(ctx, musicURL, author)
				} else {
					err = sendFileByRemoteURL(ctx, musicURL)
				}
				if err != nil {
					ctx.MessageService.SendTextMessage(ctx.Message.FromWxID, fmt.Sprintf("发送抖音音频失败: %v", err))
				}
			}(respData.Data.MusicURL, respData.Data.Author)
		}

		imageURLs := respData.Data.Images
		batchSize := 20
		for i := 0; i < len(imageURLs); i += batchSize {
			end := min(i+batchSize, len(imageURLs))
			mergedImage, err := mergeImagesVertical(ctx, imageURLs[i:end])
			if err != nil {
				if isImageTooLargeError(err) {
					// Retry the same range with smaller groups.
					p.sendImagesInSmallerBatches(ctx, imageURLs[i:end], 10)
					continue
				}
				ctx.MessageService.SendTextMessage(ctx.Message.FromWxID, fmt.Sprintf("拼接失败(批次 %d-%d): %v", i+1, end, err))
				continue
			}
			if len(mergedImage) == 0 {
				continue
			}
			err = sendMergedImage(ctx, mergedImage)
			if err != nil {
				ctx.MessageService.SendTextMessage(ctx.Message.FromWxID, fmt.Sprintf("发送图片失败: %v", err))
			}
		}
		return
	}

	ctx.MessageService.SendTextMessage(ctx.Message.FromWxID, "解析失败,可能是链接已失效或格式不正确")
}
// parseDouyinVideo resolves a Douyin share link, downloads the page it points
// to and extracts the post data. On success Code is http.StatusOK; on any
// failure the zero response is returned together with the error.
func parseDouyinVideo(rawURL string) (VideoParseResponse, error) {
	var failed VideoParseResponse

	pageURL, err := resolveDouyinRedirect(rawURL)
	if err != nil {
		return failed, err
	}

	page, err := fetchDouyinPageHTML(pageURL)
	if err != nil {
		return failed, err
	}

	parsed, err := parseDouyinPageHTML(page)
	if err != nil {
		return failed, err
	}

	result := VideoParseResponse{Code: http.StatusOK}
	result.Data = parsed
	return result, nil
}
// resolveDouyinRedirect issues one GET for rawURL without following redirects
// and returns the redirect target. If the response is not a 3xx the requested
// URL is returned; if a 3xx carries no usable Location, rawURL is returned.
func resolveDouyinRedirect(rawURL string) (string, error) {
	req, err := http.NewRequestWithContext(context.Background(), http.MethodGet, rawURL, nil)
	if err != nil {
		return "", fmt.Errorf("创建抖音短链请求失败: %w", err)
	}
	req.Header.Set("User-Agent", douyinUserAgent)

	// Hand back the first response instead of following the redirect chain.
	stopAtFirstHop := func(*http.Request, []*http.Request) error {
		return http.ErrUseLastResponse
	}
	httpClient := http.Client{Timeout: 15 * time.Second, CheckRedirect: stopAtFirstHop}

	resp, err := httpClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("解析抖音短链失败: %w", err)
	}
	defer resp.Body.Close()

	isRedirect := resp.StatusCode >= http.StatusMultipleChoices && resp.StatusCode < http.StatusBadRequest
	if !isRedirect {
		return resp.Request.URL.String(), nil
	}
	target, locErr := resp.Location()
	if locErr != nil {
		return rawURL, nil
	}
	return target.String(), nil
}
// fetchDouyinPageHTML downloads pageURL with the Douyin User-Agent and returns
// the body as a string. A non-200 status or an empty body is an error.
func fetchDouyinPageHTML(pageURL string) (string, error) {
	req, err := http.NewRequestWithContext(context.Background(), http.MethodGet, pageURL, nil)
	if err != nil {
		return "", fmt.Errorf("创建抖音页面请求失败: %w", err)
	}
	req.Header.Set("User-Agent", douyinUserAgent)

	httpClient := http.Client{Timeout: 15 * time.Second}
	resp, err := httpClient.Do(req)
	if err != nil {
		return "", fmt.Errorf("获取抖音页面失败: %w", err)
	}
	defer resp.Body.Close()

	if status := resp.StatusCode; status != http.StatusOK {
		return "", fmt.Errorf("获取抖音页面失败,状态码: %d", status)
	}

	raw, err := io.ReadAll(resp.Body)
	switch {
	case err != nil:
		return "", fmt.Errorf("读取抖音页面失败: %w", err)
	case len(raw) == 0:
		return "", fmt.Errorf("抖音页面内容为空")
	}
	return string(raw), nil
}
// parseDouyinPageHTML extracts the post item from the page and converts it,
// trying the image-post interpretation before the video one. An error is
// returned when no item is found or neither interpretation applies.
func parseDouyinPageHTML(htmlContent string) (VideoParseData, error) {
	item, found := extractDouyinAwemeItem(htmlContent)
	if found {
		if note, isNote := parseDouyinNoteItem(item); isNote {
			return note, nil
		}
		if video, isVideo := parseDouyinVideoItem(item); isVideo {
			return video, nil
		}
	}
	var none VideoParseData
	return none, fmt.Errorf("阿拉蕾,解析出错了~")
}
// extractDouyinAwemeItem finds the window._ROUTER_DATA JSON in htmlContent and
// returns the first post item of a loaderData entry that has any items.
//
// When several loaderData entries carry items, the entry with the smallest
// key is used, so the result does not depend on Go's randomized map
// iteration order. The boolean is false when the JSON is missing, cannot be
// decoded, or contains no items.
func extractDouyinAwemeItem(htmlContent string) (DouyinAwemeItem, bool) {
	match := douyinRouterDataRegexp.FindStringSubmatch(htmlContent)
	if len(match) < 2 {
		return DouyinAwemeItem{}, false
	}

	var routerData DouyinRouterData
	if err := json.Unmarshal([]byte(match[1]), &routerData); err != nil {
		log.Printf("解析抖音 _ROUTER_DATA 失败: %v\n", err)
		return DouyinAwemeItem{}, false
	}

	// Pick the lexicographically smallest key that has items.
	bestKey, found := "", false
	for key, pageData := range routerData.LoaderData {
		if len(pageData.VideoInfoRes.ItemList) == 0 {
			continue
		}
		if !found || key < bestKey {
			bestKey, found = key, true
		}
	}
	if !found {
		return DouyinAwemeItem{}, false
	}
	return routerData.LoaderData[bestKey].VideoInfoRes.ItemList[0], true
}
// parseDouyinNoteItem converts an image post into VideoParseData, keeping the
// first candidate URL of every image. It returns false when the item has no
// usable images.
func parseDouyinNoteItem(item DouyinAwemeItem) (VideoParseData, bool) {
	groups := pickDouyinImageURLGroups(item)
	if len(groups) == 0 {
		return VideoParseData{}, false
	}

	// Every group is non-empty, so index 0 always exists.
	firstURLs := make([]string, len(groups))
	for i := range groups {
		firstURLs[i] = groups[i][0]
	}

	text := cleanDouyinText(item.Desc)
	var note VideoParseData
	note.Author = cleanDouyinText(item.Author.Nickname)
	note.Avatar = pickDouyinAvatarURL(item.Author)
	note.Title = text
	note.Desc = text
	note.Images = firstURLs
	note.MusicURL = pickDouyinNoteMusicURL(item)
	return note, true
}
// pickDouyinImageURLGroups returns, for every image of the item, the list of
// distinct HTML-unescaped candidate URLs that start with "http". Images comes
// first and ImageInfos is the fallback; images with no usable URL and images
// whose URL list repeats an earlier one are dropped.
func pickDouyinImageURLGroups(item DouyinAwemeItem) [][]string {
	source := item.Images
	if len(source) == 0 {
		source = item.ImageInfos
	}

	groups := make([][]string, 0, len(source))
	emitted := map[string]struct{}{}
	for _, info := range source {
		var urls []string
		inGroup := map[string]struct{}{}
		for _, raw := range info.URLList {
			if !strings.HasPrefix(raw, "http") {
				continue
			}
			unescaped := html.UnescapeString(raw)
			if _, dup := inGroup[unescaped]; dup {
				continue
			}
			inGroup[unescaped] = struct{}{}
			urls = append(urls, unescaped)
		}
		if len(urls) == 0 {
			continue
		}
		// NUL cannot appear inside a URL, so it is a safe join separator.
		key := strings.Join(urls, "\x00")
		if _, dup := emitted[key]; dup {
			continue
		}
		emitted[key] = struct{}{}
		groups = append(groups, urls)
	}
	return groups
}
// parseDouyinVideoItem converts a video post into VideoParseData. It returns
// false when the item explicitly reports a zero duration or has no play URL.
func parseDouyinVideoItem(item DouyinAwemeItem) (VideoParseData, bool) {
	if d := item.Video.Duration; d != nil && *d == 0 {
		return VideoParseData{}, false
	}

	playURL := pickDouyinVideoURL(item.Video.PlayAddr.URLList)
	if playURL == "" {
		return VideoParseData{}, false
	}

	text := cleanDouyinText(item.Desc)
	var video VideoParseData
	video.Author = cleanDouyinText(item.Author.Nickname)
	video.Avatar = pickDouyinAvatarURL(item.Author)
	video.Title = text
	video.Desc = text
	video.Cover = pickPreferredDouyinURL(item.Video.Cover.URLList)
	video.URL = playURL
	video.MusicURL = pickPreferredDouyinURL(item.Music.PlayURL.URLList)
	return video, true
}
// pickDouyinAvatarURL returns the author's medium avatar URL, falling back to
// the thumbnail avatar when no medium URL is available.
func pickDouyinAvatarURL(author DouyinAuthor) string {
	medium := pickPreferredDouyinURL(author.AvatarMedium.URLList)
	if medium == "" {
		return pickPreferredDouyinURL(author.AvatarThumb.URLList)
	}
	return medium
}
// pickDouyinNoteMusicURL chooses the audio URL for an image post: the music
// play URL if there is one, otherwise the video play address URI when it is
// itself an http URL, otherwise the preferred video play address URL.
func pickDouyinNoteMusicURL(item DouyinAwemeItem) string {
	musicURL := pickPreferredDouyinURL(item.Music.PlayURL.URLList)
	playAddr := item.Video.PlayAddr
	switch {
	case musicURL != "":
		return musicURL
	case strings.HasPrefix(playAddr.URI, "http"):
		return decodeDouyinEscapedValue(playAddr.URI)
	default:
		return pickPreferredDouyinURL(playAddr.URLList)
	}
}
// pickDouyinVideoURL decodes the candidate play URLs, rewrites "playwm" to
// "play" in each, and returns the first one hosted on aweme.snssdk.com. When
// none matches, the first non-empty candidate is returned; "" if there is none.
func pickDouyinVideoURL(urls []string) string {
	fallback, haveFallback := "", false
	for _, candidate := range urls {
		if candidate == "" {
			continue
		}
		playURL := strings.ReplaceAll(decodeDouyinEscapedValue(candidate), "playwm", "play")
		if strings.Contains(playURL, "aweme.snssdk.com") {
			return playURL
		}
		if !haveFallback {
			fallback, haveFallback = playURL, true
		}
	}
	return fallback
}
// pickPreferredDouyinURL decodes the candidates and returns the first one that
// starts with "https://p26". When none does, the first candidate that decodes
// to a non-empty string is returned; "" if there is none.
func pickPreferredDouyinURL(urls []string) string {
	usable := make([]string, 0, len(urls))
	for _, candidate := range urls {
		if candidate == "" {
			continue
		}
		if decoded := decodeDouyinEscapedValue(candidate); decoded != "" {
			usable = append(usable, decoded)
		}
	}

	for _, decoded := range usable {
		if strings.HasPrefix(decoded, "https://p26") {
			return decoded
		}
	}
	if len(usable) == 0 {
		return ""
	}
	return usable[0]
}
func matchDouyinJSONString(text string, key string) string {
pattern := regexp.MustCompile(fmt.Sprintf(`"%s":\s*"([^"]*)"`, regexp.QuoteMeta(key)))
match := pattern.FindStringSubmatch(text)
if len(match) < 2 {
return ""
}
return cleanDouyinText(decodeDouyinEscapedValue(match[1]))
}
// decodeDouyinEscapedValue undoes the escaping found in values lifted from the
// page: HTML entities are unescaped, then, if a backslash is present, the text
// is decoded as a JSON string body (e.g. \u002F), and finally HTML entities
// are unescaped once more. If JSON decoding fails the text is left as it was.
func decodeDouyinEscapedValue(value string) string {
	text := html.UnescapeString(value)
	if !strings.Contains(text, `\`) {
		return html.UnescapeString(text)
	}

	// Wrap the text in quotes, escaping embedded quotes, to form a JSON string.
	quoted := `"` + strings.ReplaceAll(text, `"`, `\"`) + `"`
	var unquoted string
	if json.Unmarshal([]byte(quoted), &unquoted) != nil {
		return html.UnescapeString(text)
	}
	return html.UnescapeString(unquoted)
}
// cleanDouyinText unescapes HTML entities in value and trims surrounding
// whitespace from the result.
func cleanDouyinText(value string) string {
	unescaped := html.UnescapeString(value)
	return strings.TrimSpace(unescaped)
}
// nestedString walks root along keys and returns the value reached, converted
// with stringFromAny. It returns "" as soon as a step has to descend into
// something that is not a map[string]any.
func nestedString(root map[string]any, keys ...string) string {
	var node any = root
	for i := 0; i < len(keys); i++ {
		asMap, isMap := node.(map[string]any)
		if !isMap {
			return ""
		}
		node = asMap[keys[i]]
	}
	return stringFromAny(node)
}
// nestedStringList walks root along keys and returns the value reached,
// converted with stringListFromAny. It returns nil as soon as a step has to
// descend into something that is not a map[string]any.
func nestedStringList(root map[string]any, keys ...string) []string {
	var node any = root
	for i := 0; i < len(keys); i++ {
		asMap, isMap := node.(map[string]any)
		if !isMap {
			return nil
		}
		node = asMap[keys[i]]
	}
	return stringListFromAny(node)
}
// stringFromAny converts value to a string: nil becomes "", a string is
// returned unchanged, and anything else is formatted with fmt.Sprint.
func stringFromAny(value any) string {
	switch v := value.(type) {
	case nil:
		return ""
	case string:
		return v
	default:
		return fmt.Sprint(v)
	}
}
// listFromAny returns value as a []any, or nil when it has a different type.
func listFromAny(value any) []any {
	// A failed assertion leaves items at its zero value, nil.
	items, _ := value.([]any)
	return items
}
// stringListFromAny returns the string elements of value when it is a []any,
// silently skipping elements of other types. It returns nil when value is not
// a []any.
func stringListFromAny(value any) []string {
	items, isList := value.([]any)
	if !isList {
		return nil
	}
	out := make([]string, 0, len(items))
	for i := range items {
		text, isString := items[i].(string)
		if !isString {
			continue
		}
		out = append(out, text)
	}
	return out
}
// numberFromAny converts a float64, int or int64 held in value to float64.
// The boolean is false, and the number 0, for every other type.
func numberFromAny(value any) (float64, bool) {
	if f, ok := value.(float64); ok {
		return f, true
	}
	if i, ok := value.(int); ok {
		return float64(i), true
	}
	if i64, ok := value.(int64); ok {
		return float64(i64), true
	}
	return 0, false
}
// sendImagesInSmallerBatches merges and sends imageURLs in groups of batchSize.
// A merge or send failure is reported to the sender as text and the next group
// is still attempted. A non-positive batchSize does nothing.
func (p *DouyinVideoParsePlugin) sendImagesInSmallerBatches(ctx *plugin.MessageContext, imageURLs []string, batchSize int) {
	if batchSize <= 0 {
		return
	}

	total := len(imageURLs)
	for start := 0; start < total; start += batchSize {
		stop := min(start+batchSize, total)
		merged, mergeErr := mergeImagesVertical(ctx, imageURLs[start:stop])
		switch {
		case mergeErr != nil:
			ctx.MessageService.SendTextMessage(ctx.Message.FromWxID, fmt.Sprintf("拼接失败(降级批次 %d-%d): %v", start+1, stop, mergeErr))
		case len(merged) == 0:
			// Nothing to send for this group.
		default:
			if sendErr := sendMergedImage(ctx, merged); sendErr != nil {
				ctx.MessageService.SendTextMessage(ctx.Message.FromWxID, fmt.Sprintf("发送图片失败: %v", sendErr))
			}
		}
	}
}
// mergeImagesVertical downloads every URL in imageURLs, scales the images to
// the width of the widest one and stacks them top to bottom on a white canvas,
// returning the result as JPEG bytes (quality 80).
//
// A download whose body is detected as a video is not merged; it is sent as a
// video message in a separate goroutine instead. When nothing is left to merge
// the function returns (nil, nil). An error is returned for an empty URL list,
// a failed download or decode, an image with an invalid size, or a canvas
// exceeding jpegMaxDimension ("image is too large to encode", which
// isImageTooLargeError recognizes).
func mergeImagesVertical(ctx *plugin.MessageContext, imageURLs []string) ([]byte, error) {
	if len(imageURLs) == 0 {
		return nil, fmt.Errorf("图片地址为空")
	}

	// Bound each download so a stalled connection cannot block the handler
	// forever; resty applies no timeout by default.
	client := resty.New().SetTimeout(30 * time.Second)
	images := make([]image.Image, 0, len(imageURLs))
	maxWidth := 0
	for _, imageURL := range imageURLs {
		resp, err := client.R().
			SetHeader("User-Agent", douyinUserAgent).
			SetHeader("Referer", "https://www.douyin.com/").
			SetDoNotParseResponse(true).
			Get(imageURL)
		if err != nil {
			return nil, fmt.Errorf("下载图片失败: %w", err)
		}
		if resp.StatusCode() != http.StatusOK {
			resp.RawBody().Close()
			return nil, fmt.Errorf("下载图片失败HTTP状态码: %d", resp.StatusCode())
		}
		bodyData := new(bytes.Buffer)
		_, err = bodyData.ReadFrom(resp.RawBody())
		resp.RawBody().Close()
		if err != nil {
			return nil, fmt.Errorf("读取响应体失败: %w", err)
		}

		if utils.IsVideo(bodyData.Bytes()) {
			log.Printf("%s 解析到视频,跳过合并,直接发送视频消息\n", imageURL)
			go func(toWxID, _imageURL string) {
				err2 := ctx.MessageService.SendVideoMessageByRemoteURL(toWxID, _imageURL)
				if err2 != nil {
					ctx.MessageService.SendTextMessage(toWxID, fmt.Sprintf("发送抖音视频失败: %v", err2.Error()))
				}
			}(ctx.Message.FromWxID, imageURL)
			continue
		}

		img, _, err := image.Decode(bytes.NewReader(bodyData.Bytes()))
		if err != nil {
			return nil, fmt.Errorf("解析图片失败: %w", err)
		}
		bounds := img.Bounds()
		// A zero width would make the scale computation below divide by zero.
		if bounds.Dx() <= 0 || bounds.Dy() <= 0 {
			return nil, fmt.Errorf("解析图片失败: 图片尺寸无效")
		}
		maxWidth = max(maxWidth, bounds.Dx())
		images = append(images, img)
	}

	// Every entry may have turned out to be a video.
	if maxWidth == 0 || len(images) == 0 {
		return nil, nil
	}

	// scaledHeight is the height of img after proportional scaling to maxWidth.
	scaledHeight := func(img image.Image) int {
		b := img.Bounds()
		return int(float64(b.Dy()) * float64(maxWidth) / float64(b.Dx()))
	}

	totalHeight := 0
	for _, img := range images {
		totalHeight += scaledHeight(img)
	}

	if maxWidth > jpegMaxDimension || totalHeight > jpegMaxDimension {
		return nil, fmt.Errorf("image is too large to encode")
	}

	canvas := image.NewRGBA(image.Rect(0, 0, maxWidth, totalHeight))
	draw.Draw(canvas, canvas.Bounds(), image.NewUniform(color.White), image.Point{}, draw.Src)

	currentY := 0
	for _, img := range images {
		newHeight := scaledHeight(img)
		dstRect := image.Rect(0, currentY, maxWidth, currentY+newHeight)
		// Catmull-Rom resampling for higher-quality scaling.
		xdraw.CatmullRom.Scale(canvas, dstRect, img, img.Bounds(), xdraw.Over, nil)
		currentY += newHeight
	}

	var buf bytes.Buffer
	if err := jpeg.Encode(&buf, canvas, &jpeg.Options{Quality: 80}); err != nil {
		return nil, fmt.Errorf("图片编码失败: %w", err)
	}
	return buf.Bytes(), nil
}
// jpegMaxDimension is the largest canvas width or height, in pixels, that
// mergeImagesVertical will attempt to encode as JPEG.
const jpegMaxDimension = 65535
// audioExtensions is the set of lower-case file extensions treated as audio.
var audioExtensions = map[string]bool{
	".mp3":  true,
	".m4a":  true,
	".aac":  true,
	".ogg":  true,
	".flac": true,
	".wav":  true,
	".wma":  true,
	".amr":  true,
}

// isAudioURL reports whether the path component of rawURL ends in one of the
// known audio extensions, ignoring case. The query string is not considered,
// and an unparseable URL yields false.
func isAudioURL(rawURL string) bool {
	target, parseErr := url.Parse(rawURL)
	if parseErr != nil {
		return false
	}
	suffix := path.Ext(target.Path)
	known := audioExtensions[strings.ToLower(suffix)]
	return known
}
// sendMusicMessageByURL sends musicURL to the message's origin as a music
// message with a fixed title, app ID and cover, crediting author as the singer.
func sendMusicMessageByURL(ctx *plugin.MessageContext, musicURL, author string) error {
	const (
		appID    = "wx8dd6ecd81906fd84"
		coverURL = "https://uranus-houhou.oss-cn-beijing.aliyuncs.com/douyin.png"
	)

	var song robot.SongInfo
	song.Title = "抖音解析背景音乐"
	song.Singer = author
	song.CoverUrl = coverURL
	song.AppID = appID
	song.FromUsername = vars.RobotRuntime.WxID
	// The same URL serves as both the landing link and the audio stream.
	song.MusicUrl, song.Url = musicURL, musicURL

	if _, err := vars.RobotRuntime.SendMusicMessage(ctx.Message.FromWxID, song); err != nil {
		return err
	}
	return nil
}
// isImageTooLargeError reports whether err's message contains the
// "image is too large to encode" marker. A nil error yields false.
func isImageTooLargeError(err error) bool {
	const marker = "image is too large to encode"
	return err != nil && strings.Contains(err.Error(), marker)
}
// sendMergedImage uploads imageData to the message's origin as one image,
// split into chunks of vars.UploadImageChunkSize bytes. Empty data is a no-op.
// The first failing chunk aborts the upload and its error is returned.
func sendMergedImage(ctx *plugin.MessageContext, imageData []byte) error {
	contentLength := int64(len(imageData))
	if contentLength == 0 {
		return nil
	}
	fmt.Printf("抖音图片合并后大小: %dMB\n", contentLength/1024/1024)

	clientImgId := fmt.Sprintf("%v_%v", vars.RobotRuntime.WxID, time.Now().UnixNano())
	chunkSize := vars.UploadImageChunkSize
	if chunkSize <= 0 {
		// A non-positive chunk size would divide by zero below; fall back to
		// sending the whole image as a single chunk.
		chunkSize = contentLength
	}
	totalChunks := int((contentLength + chunkSize - 1) / chunkSize)

	for chunkIndex := range totalChunks {
		start := int64(chunkIndex) * chunkSize
		end := min(start+chunkSize, contentLength)
		chunkData := imageData[start:end]

		req := dto.SendImageMessageRequest{
			ToWxid:      ctx.Message.FromWxID,
			ClientImgId: clientImgId,
			FileSize:    contentLength,
			ChunkIndex:  int64(chunkIndex),
			TotalChunks: int64(totalChunks),
		}
		chunkReader := bytes.NewReader(chunkData)
		chunkHeader := &multipart.FileHeader{
			Filename: fmt.Sprintf("chunk_%d", chunkIndex),
			Size:     int64(len(chunkData)),
		}
		if _, err := ctx.MessageService.SendImageMessageStream(context.Background(), req, chunkReader, chunkHeader); err != nil {
			return err
		}
	}
	return nil
}
// sendFileByRemoteURL downloads fileURL and forwards it to the message's
// origin as a file message, uploaded in chunks of vars.UploadFileChunkSize
// bytes (200000 when that value is not positive).
//
// The download sends the same User-Agent and Referer headers as the image
// download in this file and is bounded by a timeout. The file name is taken
// from the URL path, defaulting to "douyin_music.mp3". An error is returned for
// a failed or non-200 download, an empty body, an unparseable URL, or a failed
// chunk upload.
func sendFileByRemoteURL(ctx *plugin.MessageContext, fileURL string) error {
	resp, err := resty.New().
		SetTimeout(60 * time.Second).
		R().
		SetHeader("User-Agent", douyinUserAgent).
		SetHeader("Referer", "https://www.douyin.com/").
		SetDoNotParseResponse(true).
		Get(fileURL)
	if err != nil {
		return fmt.Errorf("下载文件失败: %w", err)
	}
	defer resp.RawBody().Close()

	if resp.StatusCode() != http.StatusOK {
		return fmt.Errorf("下载文件失败HTTP状态码: %d", resp.StatusCode())
	}

	fileData, err := io.ReadAll(resp.RawBody())
	if err != nil {
		return fmt.Errorf("读取文件数据失败: %w", err)
	}
	if len(fileData) == 0 {
		return fmt.Errorf("文件数据为空")
	}

	parsedURL, err := url.Parse(fileURL)
	if err != nil {
		return fmt.Errorf("解析文件URL失败: %w", err)
	}
	filename := path.Base(parsedURL.Path)
	if filename == "" || filename == "/" || filename == "." {
		filename = "douyin_music.mp3"
	}

	fileMD5Bytes := md5.Sum(fileData)
	fileHash := hex.EncodeToString(fileMD5Bytes[:])
	fileSize := int64(len(fileData))

	chunkSize := vars.UploadFileChunkSize
	if chunkSize <= 0 {
		chunkSize = 200 * 1000
	}
	totalChunks := (fileSize + chunkSize - 1) / chunkSize
	clientAppDataID := fmt.Sprintf("%v_%v", vars.RobotRuntime.WxID, time.Now().UnixNano())

	for chunkIndex := range totalChunks {
		start := int64(chunkIndex) * chunkSize
		end := min(start+chunkSize, fileSize)
		chunkData := fileData[start:end]

		req := dto.SendFileMessageRequest{
			ToWxid:          ctx.Message.FromWxID,
			ClientAppDataId: clientAppDataID,
			Filename:        filename,
			FileHash:        fileHash,
			FileSize:        fileSize,
			ChunkIndex:      int64(chunkIndex),
			TotalChunks:     totalChunks,
		}
		chunkReader := bytes.NewReader(chunkData)
		chunkHeader := &multipart.FileHeader{
			Filename: filename,
			Size:     int64(len(chunkData)),
		}
		if err = ctx.MessageService.SendFileMessage(context.Background(), req, chunkReader, chunkHeader); err != nil {
			// Report context cancellation or expiry as a plain timeout message.
			if strings.Contains(err.Error(), "context canceled") || strings.Contains(err.Error(), "context deadline exceeded") {
				return fmt.Errorf("发送文件超时")
			}
			return err
		}
	}
	return nil
}

2337
message.go Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,72 +0,0 @@
---
name: beauty
description: "当用户发送「999」时触发。调用美女图片接口获取图片链接,再调用本地微信机器人发图接口,把图片发给当前用户。"
argument-hint: "无需参数,直接调用即可"
---
# Beauty Skill
## 描述
这是一个用于获取美女图片并直接发送给当前用户的技能。
当用户发送 `999` 时,调用外部接口获取图片链接,再调用本地微信机器人接口把图片发出去。
这个仓库里额外提供了一个可执行脚本 `scripts/beauty.py`,方便宿主机器人直接调用。
## 触发条件
- 用户发送 `999`
## 接口信息
- 获取图片地址:`https://api.pearapi.ai/api/today_wife`
- 请求方式:`GET`
- 发图接口:`http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/message/send/image/url`
- 请求方式:`POST`
- 本地脚本:`scripts/beauty.py`
- 获取图片返回示例:
```json
{
"code": 200,
"msg": "获取成功",
"data": {
"image_url": "https://api.pearapi.ai/api_assets/wife/9a6a9c38-7d6e-464f-8930-eb9dac41cde9.webp",
"role_name": "初音未来、巡音流歌",
"width": 2480,
"height": 3508
},
"api_source": "官方API网:https://api.pearapi.ai/"
}
```
- 关键字段:`data.image_url`,表示需要发送出去的图片链接。
## 环境变量
- `ROBOT_WECHAT_CLIENT_PORT`:本地微信机器人服务端口。
- `ROBOT_FROM_WX_ID`:当前消息来源用户的 wxid。
## 执行步骤
1. 当用户发送 `999` 时触发该技能。
2. 在仓库根目录下执行本地脚本:`python3 scripts/beauty.py`。
3. 脚本内部发送 `GET` 请求到 `https://api.pearapi.ai/api/today_wife`。
4. 脚本解析返回的 JSON,并提取 `data.image_url`。
5. 脚本从环境变量中读取 `ROBOT_WECHAT_CLIENT_PORT` 和 `ROBOT_FROM_WX_ID`。
6. 脚本发送 `POST` 请求到 `http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/message/send/image/url`,请求体为:
```json
{
"to_wxid": "{ROBOT_FROM_WX_ID}",
"image_urls": ["image_url"]
}
```
7. 如果任一步骤失败,回复兜底文案:`今天的美女图片暂时没拿到,等我再找找。`
## 回复要求
- 成功时,直接发送图片,不要额外追加解释文字。
- 失败时,使用固定兜底文案回复。

View File

@ -1,88 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import os
import sys
import traceback
import urllib.error
import urllib.request
sys.stderr = sys.stdout
FETCH_API_URL = "https://api.pearapi.ai/api/today_wife"
FALLBACK_TEXT = "今天的美女图片暂时没拿到,等我再找找。"
def fetch_image_url() -> str | None:
    """Fetch today's image URL from FETCH_API_URL.

    Returns the stripped ``data.image_url`` string from the JSON reply, or
    None when the request fails, the body is not valid JSON, or the reply
    does not have the expected shape.
    """
    try:
        with urllib.request.urlopen(FETCH_API_URL, timeout=10) as response:
            payload = json.load(response)
    except (urllib.error.URLError, TimeoutError, json.JSONDecodeError):
        return None

    # Valid JSON that is not an object (list, string, null) has no .get();
    # treat it as "no image" rather than letting AttributeError escape.
    if not isinstance(payload, dict):
        return None

    data = payload.get("data")
    if not isinstance(data, dict):
        return None

    image_url = data.get("image_url")
    if isinstance(image_url, str) and image_url.strip():
        return image_url.strip()
    return None
def send_image(image_url: str) -> bool:
    """POST image_url to the local robot's send-image endpoint.

    The port and recipient come from the ROBOT_WECHAT_CLIENT_PORT and
    ROBOT_FROM_WX_ID environment variables. Returns True on a 2xx response;
    for any other response the JSON body's ``code`` must be 200 or 0.
    Returns False when a variable is missing or the request fails.
    """
    env = os.environ
    port = env.get("ROBOT_WECHAT_CLIENT_PORT", "").strip()
    recipient = env.get("ROBOT_FROM_WX_ID", "").strip()
    if not (port and recipient):
        return False

    endpoint = f"http://127.0.0.1:{port}/api/v1/robot/message/send/image/url"
    encoded = json.dumps({"to_wxid": recipient, "image_urls": [image_url]}).encode("utf-8")
    request = urllib.request.Request(
        endpoint,
        data=encoded,
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            if 200 <= response.status < 300:
                return True
            reply = json.load(response)
    except (urllib.error.URLError, TimeoutError, json.JSONDecodeError):
        return False

    return reply.get("code") in (200, 0)
def main() -> int:
    """Fetch an image and send it; print the fallback text if either step fails.

    Always returns 0.
    """
    image_url = fetch_image_url()
    delivered = bool(image_url) and send_image(image_url)
    if not delivered:
        sys.stdout.write(FALLBACK_TEXT)
        sys.stdout.write("\n")
    return 0
if __name__ == "__main__":
    # Exit with main()'s result; an unexpected exception prints its traceback
    # to stdout and exits with status 1. SystemExit is not an Exception
    # subclass, so it passes through untouched.
    try:
        exit_code = main()
    except Exception:
        traceback.print_exc(file=sys.stdout)
        exit_code = 1
    raise SystemExit(exit_code)

View File

@ -1,9 +0,0 @@
# 视频理解技能
**视频理解技能由豆包加持,使用本技能请将图片识别模型设置为豆包大模型**
需要额外注入豆包密钥
- ARK_API_KEY
以上环境变量,在界面上安装完本技能后,点击`环境变量`按钮设置

View File

@ -1,89 +0,0 @@
---
name: doubao-video-understanding
description: "豆包视频解析理解工具。当用户提供一个视频链接并希望获得视频的详细描述、总结或理解时使用。"
argument-hint: "需要 prompt、video_url,可选 fps、max_tokens。"
---
# Doubao Video Understanding Skill
## 描述
这是一个 AI 视频解析理解技能,输入一个视频链接,输出视频的详细描述、总结,或对视频内容的理解。
脚本会先从数据库读取当前会话的图像 AI 配置开关,再读取对应的 `image_recognition_model` 作为理解模型,并使用环境变量中的 `ARK_API_KEY` 调用 Ark 多模态对话接口完成视频分析。
这个仓库里额外提供了一个可执行脚本 `scripts/video_understanding.py`,方便宿主机器人直接调用。
## 触发条件
- 用户发来一个视频链接,并要求描述视频内容。
- 用户说「总结这个视频」「帮我理解这个视频」「分析一下这个视频讲了什么」。
- 用户希望获取视频的详细描述、核心摘要、主题理解。
## 入参规范
```json
{
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "可选的分析指令。默认会要求模型输出详细描述、总结和理解。"
},
"video_url": {
"type": "string",
"description": "需要解析的视频链接,必须是 https 地址。"
},
"fps": {
"type": "integer",
"description": "抽帧频率,可选,默认 2。"
},
"max_tokens": {
"type": "integer",
"description": "模型输出最大 token 数,可选,默认 800。"
}
},
"required": ["prompt", "video_url"],
"additionalProperties": false
}
```
对应的命令行参数为:
- `--prompt <分析指令>` 必填
- `--video_url <视频链接>` 必填,必须是 `https` 地址
- `--fps <抽帧频率>` 可选
- `--max_tokens <最大输出 token 数>` 可选
## 依赖安装
- 脚本首次运行时会自动创建虚拟环境并安装依赖,无需手动执行。
- 如需手动重新安装,可执行:`python3 scripts/bootstrap.py`
## 执行步骤
1. 当用户提供视频链接并要求描述、总结或理解时触发该技能。
2. 提取 `prompt` 用户需求和 `video_url` 视频链接。可选提取 `fps`、`max_tokens`。
3. 在仓库根目录执行脚本,例如:
```bash
python3 scripts/video_understanding.py --prompt '请描述这个视频' --video_url 'https://example.com/demo.mp4'
```
4. 脚本会从数据库读取 `image_ai_enabled` 和 `image_recognition_model`。模型读取顺序为:当前会话覆盖配置优先,其次全局配置;如果表字段不存在,则回退到 `image_ai_settings` JSON 中的同名字段。
5. 脚本调用 `https://ark.cn-beijing.volces.com/api/v3/chat/completions`,将视频链接和分析指令一起发送给视觉模型。
6. 成功时,脚本输出文本结果,宿主机器人可直接作为消息回复给用户。
## 校验规则
- `prompt` 不能为空。
- `video_url` 不能为空,且必须是 `https` 链接。
- `fps` 必须大于 0。
- `max_tokens` 必须大于 0。
- 环境变量 `ARK_API_KEY` 必须存在。
- 数据库里必须开启图像 AI 能力,并能解析出 `image_recognition_model`
## 回复要求
- 成功时,脚本输出视频理解结果。
- 失败时,返回脚本输出的具体错误信息。

View File

@ -1,134 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import hashlib
import subprocess
import sys
import traceback
from pathlib import Path
sys.stderr = sys.stdout
def _skill_root_from(script_dir: Path) -> Path:
return script_dir.parent
def _venv_dir(script_dir: Path) -> Path:
    """Return the ``.venv`` directory located in the skill root."""
    skill_root = _skill_root_from(script_dir)
    return skill_root / ".venv"
def _venv_python(venv_dir: Path) -> Path:
if sys.platform == "win32":
return venv_dir / "Scripts" / "python.exe"
return venv_dir / "bin" / "python"
def _stamp_file(venv_dir: Path) -> Path:
return venv_dir / ".req_hash"
def _file_hash(path: Path) -> str:
return hashlib.sha256(path.read_bytes()).hexdigest()
def _deps_up_to_date(requirements_file: Path, venv_dir: Path) -> bool:
    """Report whether the stamp in venv_dir matches requirements_file's hash.

    A missing stamp file counts as out of date.
    """
    marker = _stamp_file(venv_dir)
    if marker.is_file():
        recorded = marker.read_text().strip()
        return recorded == _file_hash(requirements_file)
    return False
def _write_stamp(requirements_file: Path, venv_dir: Path) -> None:
    """Record requirements_file's current hash in the venv's stamp file."""
    current_hash = _file_hash(requirements_file)
    _stamp_file(venv_dir).write_text(current_hash)
def _ensure_venv(venv_dir: Path, venv_python: Path) -> int:
    """Create the virtual environment at venv_dir unless venv_python exists.

    Returns 0 on success (or when nothing had to be done), otherwise the
    failing ``python -m venv`` exit status. Raises RuntimeError when no
    Python interpreter can be located.
    """
    if venv_python.is_file():
        return 0

    sys.stdout.write(f"未检测到技能虚拟环境,正在创建: {venv_dir}\n")

    import shutil

    interpreter = sys.executable
    if not interpreter:
        # Fall back to whatever interpreter is on PATH.
        for candidate in ("python3", "python"):
            located = shutil.which(candidate)
            if located:
                interpreter = located
                break
    if not interpreter:
        raise RuntimeError("无法找到 Python 解释器路径")

    result = subprocess.run(
        [interpreter, "-m", "venv", str(venv_dir)],
        stdout=sys.stdout,
        stderr=sys.stdout,
    )
    if result.returncode != 0:
        sys.stdout.write(f"创建虚拟环境失败,退出码: {result.returncode}\n")
        return result.returncode
    return 0
def main() -> int:
    """Prepare the skill's virtual environment and install its requirements.

    Steps: locate requirements.txt next to this script, create the venv if
    needed, skip installation when the recorded requirements hash is current,
    otherwise upgrade pip, install the requirements and record the new hash.
    Returns 0 on success or the failing step's exit status.
    """
    script_dir = Path(__file__).resolve().parent
    requirements_file = script_dir / "requirements.txt"
    venv_dir = _venv_dir(script_dir)
    venv_python = _venv_python(venv_dir)

    if not requirements_file.is_file():
        sys.stdout.write(f"未找到依赖文件: {requirements_file}\n")
        return 1

    venv_status = _ensure_venv(venv_dir, venv_python)
    if venv_status != 0:
        return venv_status

    if _deps_up_to_date(requirements_file, venv_dir):
        sys.stdout.write("依赖已是最新,跳过安装\n")
        return 0

    pip = [str(venv_python), "-m", "pip", "install"]

    upgrade = subprocess.run(pip + ["--upgrade", "pip"], stdout=sys.stdout, stderr=sys.stdout)
    if upgrade.returncode != 0:
        sys.stdout.write(f"升级 pip 失败,退出码: {upgrade.returncode}\n")
        return upgrade.returncode

    install = subprocess.run(pip + ["-r", str(requirements_file)], stdout=sys.stdout, stderr=sys.stdout)
    if install.returncode != 0:
        sys.stdout.write(f"安装依赖失败,退出码: {install.returncode}\n")
        return install.returncode

    _write_stamp(requirements_file, venv_dir)
    sys.stdout.write(f"依赖安装完成,当前技能虚拟环境: {venv_dir}\n")
    return 0
if __name__ == "__main__":
    # Exit with main()'s result; an unexpected exception prints its traceback
    # to stdout and exits with status 1. SystemExit is not an Exception
    # subclass, so it passes through untouched.
    try:
        exit_code = main()
    except Exception:
        traceback.print_exc(file=sys.stdout)
        exit_code = 1
    raise SystemExit(exit_code)

View File

@ -1,2 +0,0 @@
cryptography
pymysql>=1.1,<2

View File

@ -1,365 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import os
import subprocess
import sys
import traceback
import urllib.error
import urllib.request
from pathlib import Path
from urllib.parse import urlparse
sys.stderr = sys.stdout
DEFAULT_PROMPT = "请用中文输出分成三部分1. 详细描述视频内容2. 总结核心信息3. 给出对视频的理解。"
DEFAULT_FPS = 2
DEFAULT_MAX_TOKENS = 800
def _skill_root() -> Path:
return Path(__file__).resolve().parent.parent
def _skill_venv_python() -> Path:
    """Return the expected interpreter path inside the skill's ``.venv``."""
    venv_dir = _skill_root() / ".venv"
    if sys.platform == "win32":
        parts = ("Scripts", "python.exe")
    else:
        parts = ("bin", "python")
    return venv_dir.joinpath(*parts)
def _get_python_executable() -> str:
if sys.executable:
return sys.executable
import shutil
for candidate in ("python3", "python"):
found = shutil.which(candidate)
if found:
return found
raise RuntimeError("无法找到 Python 解释器路径")
def _run_bootstrap() -> None:
    """Run the sibling ``bootstrap.py`` and abort if it fails.

    Raises:
        SystemExit: With bootstrap's exit code when that code is non-zero.
    """
    script = Path(__file__).resolve().parent / "bootstrap.py"
    completed = subprocess.run([_get_python_executable(), str(script)])
    exit_code = completed.returncode
    if exit_code != 0:
        raise SystemExit(exit_code)
def _ensure_skill_venv_python() -> None:
    """Make sure this script runs under the skill's own virtual environment.

    Bootstraps the venv when its interpreter is missing. Returns when
    ``sys.prefix`` already equals the venv directory; otherwise replaces the
    current process (``os.execv``) with the venv interpreter running this same
    script with the same arguments.

    Raises:
        SystemExit: If bootstrap fails, or with code 1 when the interpreter is
            still missing after bootstrap.
    """
    interpreter = _skill_venv_python()
    if not interpreter.is_file():
        _run_bootstrap()
        interpreter = _skill_venv_python()
        if not interpreter.is_file():
            sys.stdout.write("bootstrap 后仍未找到虚拟环境\n")
            raise SystemExit(1)

    venv_root = (_skill_root() / ".venv").resolve()
    already_inside = Path(sys.prefix) == venv_root
    if already_inside:
        return

    this_script = str(Path(__file__).resolve())
    os.execv(str(interpreter), [str(interpreter), this_script, *sys.argv[1:]])
# Switch to the skill's virtualenv (bootstrapping it if needed) before importing
# third-party dependencies.
_ensure_skill_venv_python()
try:
    import pymysql  # type: ignore # noqa: E402
except ModuleNotFoundError:
    # Dependency missing: run bootstrap, then restart this script with the same arguments.
    # NOTE(review): if the import still fails after bootstrap this looks like it would
    # re-exec repeatedly — confirm bootstrap always ends with pymysql installed.
    _run_bootstrap()
    _py = _get_python_executable()
    os.execv(_py, [_py, str(Path(__file__).resolve()), *sys.argv[1:]])
def _mysql_connect():
host = os.environ.get("MYSQL_HOST", "127.0.0.1")
port = int(os.environ.get("MYSQL_PORT", "3306"))
user = os.environ.get("MYSQL_USER", "root")
password = os.environ.get("MYSQL_PASSWORD", "")
database = os.environ.get("ROBOT_CODE", "")
if not database:
raise RuntimeError("环境变量 ROBOT_CODE 未配置")
return pymysql.connect(
host=host,
port=port,
user=user,
password=password,
database=database,
charset="utf8mb4",
connect_timeout=10,
read_timeout=30,
)
def _query_one(conn, sql: str, params: tuple = ()) -> dict | None:
cur = conn.cursor()
cur.execute(sql, params)
columns = [desc[0] for desc in cur.description] if cur.description else []
row = cur.fetchone()
cur.close()
if row is None:
return None
return dict(zip(columns, row))
def _table_has_column(conn, table_name: str, column_name: str) -> bool:
sql = (
"SELECT 1 FROM information_schema.columns "
"WHERE table_schema = %s AND table_name = %s AND column_name = %s LIMIT 1"
)
database_name = conn.db
if isinstance(database_name, (bytes, bytearray)):
database_name = database_name.decode("utf-8")
cur = conn.cursor()
cur.execute(sql, (database_name, table_name, column_name))
row = cur.fetchone()
cur.close()
return row is not None
def _decode_settings(raw: object) -> dict:
if not raw:
return {}
if isinstance(raw, (bytes, bytearray)):
raw = raw.decode("utf-8")
if isinstance(raw, str) and raw.strip():
return json.loads(raw)
return {}
def _extract_model(record: dict | None, settings_json: dict) -> str:
if record:
model = record.get("image_recognition_model")
if isinstance(model, (bytes, bytearray)):
model = model.decode("utf-8")
if isinstance(model, str) and model.strip():
return model.strip()
for key in ("image_recognition_model", "imageRecognitionModel"):
value = settings_json.get(key)
if isinstance(value, str) and value.strip():
return value.strip()
return ""
def load_understanding_settings(conn, from_wx_id: str) -> tuple[bool, str]:
    """Resolve whether image AI is enabled and which model applies to a chat.

    Global settings are read first; a per-chat row (``chat_room_settings`` for
    ids ending in ``@chatroom``, otherwise ``friend_settings``) overrides them.
    The ``image_recognition_model`` column is selected only from tables that
    have it.

    Args:
        conn: Open database connection.
        from_wx_id: Chat room id or friend WeChat id the message came from.

    Returns:
        ``(enabled, model)``; ``model`` is ``""`` when none is configured.
    """
    base_fields = "image_ai_enabled, image_ai_settings"
    model_field = ", image_recognition_model"

    global_has_model = _table_has_column(conn, "global_settings", "image_recognition_model")
    chatroom_has_model = _table_has_column(conn, "chat_room_settings", "image_recognition_model")
    friend_has_model = _table_has_column(conn, "friend_settings", "image_recognition_model")

    global_fields = base_fields + (model_field if global_has_model else "")
    global_record = _query_one(conn, f"SELECT {global_fields} FROM global_settings LIMIT 1")

    enabled = False
    settings_json: dict = {}
    model = ""
    if global_record:
        flag = global_record.get("image_ai_enabled")
        if flag is not None:
            enabled = bool(flag)
        settings_json = _decode_settings(global_record.get("image_ai_settings"))
        model = _extract_model(global_record, settings_json)

    if from_wx_id.endswith("@chatroom"):
        override_fields = base_fields + (model_field if chatroom_has_model else "")
        override = _query_one(
            conn,
            f"SELECT {override_fields} FROM chat_room_settings WHERE chat_room_id = %s LIMIT 1",
            (from_wx_id,),
        )
    else:
        override_fields = base_fields + (model_field if friend_has_model else "")
        override = _query_one(
            conn,
            f"SELECT {override_fields} FROM friend_settings WHERE wechat_id = %s LIMIT 1",
            (from_wx_id,),
        )

    if not override:
        return enabled, model

    flag = override.get("image_ai_enabled")
    if flag is not None:
        enabled = bool(flag)
    override_settings = _decode_settings(override.get("image_ai_settings"))
    if override_settings:
        settings_json = override_settings
    # A blank override model keeps the globally configured one.
    override_model = _extract_model(override, settings_json)
    return enabled, override_model or model
def _http_post_json(url: str, body: dict, headers: dict, timeout: int = 300) -> dict:
    """POST ``body`` as JSON and return the decoded JSON response.

    Args:
        url: Endpoint to call.
        body: JSON-serialisable request payload.
        headers: Request headers.
        timeout: Timeout in seconds passed to ``urlopen``.

    Raises:
        RuntimeError: On an HTTP error status (message carries the status code
            and response body) or on any other ``URLError``.
    """
    payload = json.dumps(body).encode("utf-8")
    request = urllib.request.Request(url, data=payload, headers=headers, method="POST")
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            text = response.read().decode("utf-8")
            return json.loads(text)
    except urllib.error.HTTPError as exc:
        error_body = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"HTTP {exc.code}: {error_body}") from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(str(exc)) from exc
def _extract_response_text(payload: dict) -> str:
choices = payload.get("choices", [])
if not choices:
return ""
message = choices[0].get("message", {})
content = message.get("content", "")
if isinstance(content, str):
return content.strip()
if isinstance(content, list):
texts: list[str] = []
for item in content:
if not isinstance(item, dict):
continue
if item.get("type") == "text" and isinstance(item.get("text"), str):
texts.append(item["text"].strip())
return "\n".join(text for text in texts if text)
return ""
def analyze_video(video_url: str, prompt: str, model: str, fps: int, max_tokens: int) -> str:
    """Send a video URL plus a text prompt to the Ark chat-completions endpoint.

    Args:
        video_url: URL of the video to analyse.
        prompt: Text instruction sent alongside the video.
        model: Model identifier.
        fps: Value sent as the ``fps`` field (as a string) of the video part.
        max_tokens: Value sent as ``max_tokens``.

    Returns:
        The text extracted from the response.

    Raises:
        RuntimeError: If ``ARK_API_KEY`` is missing, ``model`` is empty, or the
            response carries no text.
    """
    api_key = os.environ.get("ARK_API_KEY", "").strip()
    if not api_key:
        raise RuntimeError("环境变量 ARK_API_KEY 未配置")
    if not model:
        raise RuntimeError("数据库中未配置 image_recognition_model")

    video_part = {"type": "video_url", "video_url": {"url": video_url}, "fps": str(fps)}
    text_part = {"type": "text", "text": prompt}
    request_body = {
        "model": model,
        "messages": [{"role": "user", "content": [video_part, text_part]}],
        "max_tokens": max_tokens,
    }
    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    response = _http_post_json(
        "https://ark.cn-beijing.volces.com/api/v3/chat/completions",
        request_body,
        request_headers,
        timeout=300,
    )
    answer = _extract_response_text(response)
    if not answer:
        raise RuntimeError("视频理解接口未返回文本内容")
    return answer
def _validate_video_url(value: str) -> str:
parsed = urlparse(value)
if parsed.scheme != "https" or not parsed.netloc:
raise ValueError("video_url 必须是 https 链接")
return value
def _parse_cli_params(argv: list[str]) -> dict:
    """Parse the command-line options into a plain dict.

    Recognised options: ``--video_url``, ``--prompt``, ``--fps`` and
    ``--max_tokens``.

    Args:
        argv: Arguments excluding the program name.

    Returns:
        Dict with keys ``video_url``, ``prompt``, ``fps`` and ``max_tokens``.

    Raises:
        ValueError: On unsupported arguments, or when ``fps`` / ``max_tokens``
            is not positive.
    """
    parser = argparse.ArgumentParser(add_help=False)
    for flag, extra in (
        ("--video_url", {"default": ""}),
        ("--prompt", {"default": DEFAULT_PROMPT}),
        ("--fps", {"type": int, "default": DEFAULT_FPS}),
        ("--max_tokens", {"type": int, "default": DEFAULT_MAX_TOKENS}),
    ):
        parser.add_argument(flag, **extra)

    namespace, unknown = parser.parse_known_args(argv)
    if unknown:
        raise ValueError(f"存在不支持的参数: {' '.join(unknown)}")
    if namespace.fps <= 0:
        raise ValueError("fps 必须大于 0")
    if namespace.max_tokens <= 0:
        raise ValueError("max_tokens 必须大于 0")

    return {
        key: getattr(namespace, key)
        for key in ("video_url", "prompt", "fps", "max_tokens")
    }
def main() -> int:
    """CLI entry point: validate input, load settings and print the analysis.

    Returns:
        0 when the analysis was printed or image AI is disabled for the chat,
        1 on any validation, configuration, database or API failure.
    """

    def _fail(message: str) -> int:
        """Write ``message`` to stdout and return exit code 1."""
        sys.stdout.write(message)
        return 1

    if len(sys.argv) < 2:
        return _fail("缺少输入参数\n")

    try:
        params = _parse_cli_params(sys.argv[1:])
    except ValueError as exc:
        return _fail(f"参数格式错误: {exc}\n")

    video_url = params.get("video_url", "").strip()
    if not video_url:
        return _fail("缺少视频链接\n")
    try:
        _validate_video_url(video_url)
    except ValueError as exc:
        return _fail(f"参数格式错误: {exc}\n")

    prompt = params.get("prompt", "").strip() or DEFAULT_PROMPT
    fps = int(params.get("fps", DEFAULT_FPS))
    max_tokens = int(params.get("max_tokens", DEFAULT_MAX_TOKENS))

    from_wx_id = os.environ.get("ROBOT_FROM_WX_ID", "").strip()
    if not from_wx_id:
        return _fail("环境变量 ROBOT_FROM_WX_ID 未配置\n")

    try:
        conn = _mysql_connect()
    except Exception as exc:
        return _fail(f"数据库连接失败: {exc}\n")

    try:
        enabled, model = load_understanding_settings(conn, from_wx_id)
    except Exception as exc:
        return _fail(f"加载视频理解配置失败: {exc}\n")
    finally:
        # Closing is best-effort; a close failure must not mask the real result.
        try:
            conn.close()
        except Exception:
            pass

    if not enabled:
        sys.stdout.write("AI 图像识别未开启\n")
        return 0

    try:
        content = analyze_video(video_url, prompt, model, fps, max_tokens)
    except Exception as exc:
        return _fail(f"调用视频理解接口失败: {exc}\n")

    sys.stdout.write(f"{content}\n")
    return 0
if __name__ == "__main__":
    # Script entry point: exit with the code returned by main().
    try:
        raise SystemExit(main())
    except SystemExit:
        # Deliberate exits propagate unchanged.
        raise
    except Exception:
        # Unexpected errors: print the traceback to stdout and exit with status 1.
        traceback.print_exc(file=sys.stdout)
        raise SystemExit(1)

View File

@ -1,53 +0,0 @@
---
name: douyin-video-parse
description: "当用户发送包含抖音短链接(https://v.douyin.com/xxx)的消息时触发。自动解析抖音视频/图片,并发送给当前用户。"
argument-hint: "消息中包含抖音短链接即可自动触发"
---
# Douyin Video Parse Skill
## 描述
这是一个用于解析抖音短视频/图片的技能。
当用户发送的消息中包含 `https://v.douyin.com/` 链接时,自动解析该链接对应的视频或图片,并通过本地微信机器人接口发送给当前用户。
这个仓库里额外提供了一个可执行脚本 `scripts/douyin_video_parse.py`,方便宿主机器人直接调用。
## 触发条件
- 用户消息中包含 `https://v.douyin.com/` 链接
## 解析原理
1. 访问抖音短链接,跟随 302 重定向获取真实页面 URL
2. 请求真实页面 HTML,从中提取 `window._ROUTER_DATA` JSON 数据
3. 从 JSON 中解析出视频播放地址或图片列表
4. 通过本地微信机器人接口发送视频或图片
## 环境变量
- `ROBOT_WECHAT_CLIENT_PORT`:本地微信机器人服务端口。
- `ROBOT_FROM_WX_ID`:当前消息来源用户的 wxid。
- `ROBOT_MESSAGE_CONTENT`:用户发送的原始消息内容(用于提取抖音链接)。
## 执行步骤
1. 当用户消息中包含 `https://v.douyin.com/` 链接时触发该技能。
2. 在仓库根目录下执行本地脚本:`python3 scripts/douyin_video_parse.py`。
3. 脚本从环境变量 `ROBOT_MESSAGE_CONTENT` 中提取抖音短链接。
4. 脚本访问短链接,跟随重定向获取真实页面 URL。
5. 脚本请求真实页面,解析 `window._ROUTER_DATA` 中的视频/图片信息。
6. 如果是视频:
- 先发送文字提示(作者、标题)
- 再调用 `POST http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/message/send/video/url` 发送视频
7. 如果是图片:
- 发送文字提示(作者、标题、图片数量)
- 调用 `POST http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/message/send/image/url` 逐张发送图片
8. 如果解析失败,回复兜底文案:`抖音解析失败,可能是链接已失效或格式不正确。`
## 回复要求
- 视频类型:发送视频文件,附带作者和标题信息。
- 图片类型:发送所有图片,附带作者和标题信息。
- 失败时,使用固定兜底文案回复。

View File

@ -1,345 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import html
import json
import os
import re
import sys
import traceback
import urllib.error
import urllib.parse
import urllib.request
sys.stderr = sys.stdout
DOUYIN_USER_AGENT = (
"Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) "
"AppleWebKit/605.1.15 (KHTML, like Gecko) "
"Version/14.0 Mobile/15E148 Safari/604.1"
)
DOUYIN_REFERER = "https://www.douyin.com/"
FALLBACK_TEXT = "抖音解析失败,可能是链接已失效或格式不正确。"
ROUTER_DATA_RE = re.compile(r"(?s)window\._ROUTER_DATA\s*=\s*(\{.*?\})\s*</script>")
DOUYIN_URL_RE = re.compile(r"https://[^\s]+")
def build_request(url: str) -> urllib.request.Request:
    """Build a request for ``url`` carrying the Douyin User-Agent and Referer headers."""
    request_headers = {
        "User-Agent": DOUYIN_USER_AGENT,
        "Referer": DOUYIN_REFERER,
    }
    request = urllib.request.Request(url, headers=request_headers)
    return request
def resolve_redirect(short_url: str) -> str | None:
    """Resolve a Douyin short link to the URL it redirects to.

    Redirects are not followed; the target is read from the ``Location``
    header of the redirect response.

    Args:
        short_url: Short link to resolve.

    Returns:
        The redirect target (made absolute against ``short_url`` when the
        server sends a relative ``Location``), the response URL when the server
        answers without redirecting, or ``None`` on failure.
    """

    class NoRedirectHandler(urllib.request.HTTPRedirectHandler):
        def redirect_request(self, req, fp, code, msg, headers, newurl):
            # Returning None stops urllib from following the redirect, so the
            # redirect response surfaces as an HTTPError carrying the headers.
            return None

    opener = urllib.request.build_opener(NoRedirectHandler)
    req = build_request(short_url)
    try:
        # Context manager closes the connection instead of leaving it to the GC.
        with opener.open(req, timeout=15) as response:
            return response.url
    except urllib.error.HTTPError as exc:
        try:
            location = exc.headers.get("Location")
        finally:
            # HTTPError wraps the open response; release it explicitly.
            exc.close()
        if not location:
            return None
        if not urllib.parse.urlparse(location).scheme:
            # Relative or scheme-relative Location: resolve against the short link.
            location = urllib.parse.urljoin(short_url, location)
        return location
    except (urllib.error.URLError, TimeoutError):
        return None
def fetch_page_html(page_url: str) -> str | None:
    """Download ``page_url`` and return its body as text.

    Returns:
        The body decoded as UTF-8 (undecodable bytes replaced), or ``None``
        when the status is not 200 or the request fails.
    """
    request = build_request(page_url)
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            body = response.read() if response.status == 200 else None
    except (urllib.error.URLError, TimeoutError):
        return None
    if body is None:
        return None
    return body.decode("utf-8", errors="replace")
def decode_escaped_value(value: str) -> str:
    """Undo HTML entities and JSON backslash escapes in ``value``.

    When the text contains a backslash it is parsed as a JSON string body; if
    that parse fails the text is kept as is. HTML entities are unescaped both
    before and after that step.
    """
    text = html.unescape(value)
    if "\\" in text:
        quoted = '"' + text.replace('"', '\\"') + '"'
        try:
            text = json.loads(quoted)
        except (json.JSONDecodeError, ValueError):
            # Not a valid JSON string body: leave the text untouched.
            pass
    return html.unescape(text)
def pick_preferred_url(urls: list[str]) -> str:
    """Return the first decoded URL starting with ``https://p26``, else the first usable one.

    Empty entries are skipped. Returns ``""`` when nothing usable is found.
    """
    fallback = ""
    for candidate in filter(None, urls):
        decoded = decode_escaped_value(candidate)
        if not decoded:
            continue
        if decoded.startswith("https://p26"):
            return decoded
        fallback = fallback or decoded
    return fallback
def pick_video_url(urls: list[str]) -> str:
    """Choose a video URL, preferring one that contains ``aweme.snssdk.com``.

    Each non-empty URL is decoded and has ``playwm`` replaced by ``play``.
    Returns the first preferred URL, else the first decoded URL, else ``""``.
    """
    candidates = [
        decode_escaped_value(raw).replace("playwm", "play")
        for raw in urls
        if raw
    ]
    preferred = next((url for url in candidates if "aweme.snssdk.com" in url), None)
    if preferred is not None:
        return preferred
    return candidates[0] if candidates else ""
def extract_aweme_item(html_content: str) -> dict | None:
    """Extract the first aweme item from the page's ``_ROUTER_DATA`` JSON.

    Args:
        html_content: Page HTML.

    Returns:
        The first dict entry of ``loaderData.*.videoInfoRes.item_list``, or
        ``None`` when the data is missing, is not valid JSON, or has an
        unexpected shape (null or non-dict nodes).
    """
    match = ROUTER_DATA_RE.search(html_content)
    if not match:
        return None
    try:
        router_data = json.loads(match.group(1))
    except json.JSONDecodeError:
        return None
    if not isinstance(router_data, dict):
        return None

    loader_data = router_data.get("loaderData")
    if not isinstance(loader_data, dict):
        return None

    for page_data in loader_data.values():
        if not isinstance(page_data, dict):
            continue
        # Keys may be present with a null value, so check types rather than
        # relying on ``.get`` defaults.
        video_info_res = page_data.get("videoInfoRes")
        if not isinstance(video_info_res, dict):
            continue
        item_list = video_info_res.get("item_list")
        if isinstance(item_list, list) and item_list and isinstance(item_list[0], dict):
            return item_list[0]
    return None
def parse_note_item(item: dict) -> dict | None:
    """Parse an image ("note") post into a result dict.

    Takes the first ``http`` URL of every entry in ``images`` (or
    ``image_infos``), de-duplicated in order.

    Returns:
        Dict with ``type`` ("note"), ``author``, ``title``, ``images`` and
        ``music_url``, or ``None`` when the item has no usable image URL.
    """
    images = item.get("images") or item.get("image_infos") or []
    if not images:
        return None

    image_urls = []
    seen = set()
    for img_info in images:
        # Only the first http(s) URL of each image entry is considered.
        first_http = next(
            (url for url in img_info.get("url_list", []) if url and url.startswith("http")),
            None,
        )
        if first_http is None:
            continue
        decoded = html.unescape(first_http)
        if decoded in seen:
            continue
        seen.add(decoded)
        image_urls.append(decoded)

    if not image_urls:
        return None

    author = item.get("author", {})
    music = item.get("music", {})
    music_url = pick_preferred_url(music.get("play_url", {}).get("url_list", []))

    # Fallback music URL from video play_addr
    if not music_url:
        play_addr = item.get("video", {}).get("play_addr", {})
        uri = play_addr.get("uri", "")
        if uri.startswith("http"):
            music_url = decode_escaped_value(uri)
        else:
            music_url = pick_preferred_url(play_addr.get("url_list", []))

    return {
        "type": "note",
        "author": html.unescape(author.get("nickname", "")),
        "title": html.unescape(item.get("desc", "")),
        "images": image_urls,
        "music_url": music_url,
    }
def parse_video_item(item: dict) -> dict | None:
    """Parse a video post into a result dict.

    Returns:
        Dict with ``type`` ("video"), ``author``, ``title``, ``url`` and
        ``cover``, or ``None`` when the duration is 0 or no play URL exists.
    """
    video = item.get("video", {})
    duration = video.get("duration")
    zero_length = duration is not None and duration == 0
    if zero_length:
        return None

    video_url = pick_video_url(video.get("play_addr", {}).get("url_list", []))
    if not video_url:
        return None

    author = item.get("author", {})
    cover_urls = video.get("cover", {}).get("url_list", [])
    return {
        "type": "video",
        "author": html.unescape(author.get("nickname", "")),
        "title": html.unescape(item.get("desc", "")),
        "url": video_url,
        "cover": pick_preferred_url(cover_urls),
    }
def parse_douyin(short_url: str) -> dict | None:
    """Resolve a short link, fetch its page and parse the post it contains.

    Returns:
        The note (image) result when the item has images, otherwise the video
        result, or ``None`` when any step fails.
    """
    resolved_url = resolve_redirect(short_url)
    html_content = fetch_page_html(resolved_url) if resolved_url else None
    item = extract_aweme_item(html_content) if html_content else None
    if not item:
        return None

    # Try note (images) first, then video
    return parse_note_item(item) or parse_video_item(item) or None
def send_video(video_url: str, robot_port: str, to_wxid: str) -> bool:
    """Ask the local robot API to send ``video_url`` to ``to_wxid``.

    Returns:
        True when the API answers with a 2xx status, False on a URL error or
        timeout.
    """
    api_url = f"http://127.0.0.1:{robot_port}/api/v1/robot/message/send/video/url"
    payload = {
        "to_wxid": to_wxid,
        "video_urls": [video_url],
    }
    request = urllib.request.Request(
        api_url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=60) as response:
            status = response.status
            return 200 <= status < 300
    except (urllib.error.URLError, TimeoutError):
        return False
def send_images(image_urls: list[str], robot_port: str, to_wxid: str) -> bool:
    """Ask the local robot API to send ``image_urls`` to ``to_wxid``.

    Returns:
        True when the API answers with a 2xx status, False on a URL error or
        timeout.
    """
    api_url = f"http://127.0.0.1:{robot_port}/api/v1/robot/message/send/image/url"
    payload = {
        "to_wxid": to_wxid,
        "image_urls": image_urls,
    }
    request = urllib.request.Request(
        api_url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=60) as response:
            status = response.status
            return 200 <= status < 300
    except (urllib.error.URLError, TimeoutError):
        return False
def send_text(text: str, robot_port: str, to_wxid: str) -> bool:
    """Ask the local robot API to send a text message to ``to_wxid``.

    Returns:
        True when the API answers with a 2xx status, False on a URL error or
        timeout.
    """
    api_url = f"http://127.0.0.1:{robot_port}/api/v1/robot/message/send/text"
    payload = {
        "to_wxid": to_wxid,
        "content": text,
    }
    request = urllib.request.Request(
        api_url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            status = response.status
            return 200 <= status < 300
    except (urllib.error.URLError, TimeoutError):
        return False
def main() -> int:
    """Parse the Douyin link in the incoming message and forward its media.

    Reads ``ROBOT_WECHAT_CLIENT_PORT``, ``ROBOT_FROM_WX_ID`` and
    ``ROBOT_MESSAGE_CONTENT`` from the environment. Prints the fallback text
    when input is missing or parsing fails, and a failure notice when the media
    could not be sent.

    Returns:
        Always 0.
    """
    env = os.environ
    robot_port = env.get("ROBOT_WECHAT_CLIENT_PORT", "").strip()
    to_wxid = env.get("ROBOT_FROM_WX_ID", "").strip()
    message_content = env.get("ROBOT_MESSAGE_CONTENT", "").strip()

    def _fallback() -> int:
        """Print the fixed fallback reply and return exit code 0."""
        sys.stdout.write(FALLBACK_TEXT + "\n")
        return 0

    if not (robot_port and to_wxid and message_content):
        return _fallback()

    # Extract douyin URL from message
    douyin_url = next(
        (u for u in DOUYIN_URL_RE.findall(message_content) if "v.douyin.com" in u),
        None,
    )
    if douyin_url is None:
        return _fallback()

    result = parse_douyin(douyin_url)
    if not result:
        return _fallback()

    kind = result["type"]
    if kind == "video":
        # Send info text, then the video itself
        info_text = f"抖音视频解析成功\n作者: {result['author']}\n标题: {result['title']}"
        send_text(info_text, robot_port, to_wxid)
        if not send_video(result["url"], robot_port, to_wxid):
            sys.stdout.write("发送抖音视频失败,请稍后重试。\n")
    elif kind == "note":
        # Send info text, then the images
        info_text = (
            f"抖音图片解析成功\n"
            f"作者: {result['author']}\n"
            f"标题: {result['title']}\n\n"
            f"{len(result['images'])}张图片正在发送中..."
        )
        send_text(info_text, robot_port, to_wxid)
        if not send_images(result["images"], robot_port, to_wxid):
            sys.stdout.write("发送抖音图片失败,请稍后重试。\n")
    return 0
if __name__ == "__main__":
    # Script entry point: exit with the code returned by main().
    try:
        raise SystemExit(main())
    except SystemExit:
        # Deliberate exits propagate unchanged.
        raise
    except Exception:
        # Unexpected errors: print the traceback to stdout and exit with status 1.
        traceback.print_exc(file=sys.stdout)
        raise SystemExit(1)

View File

@ -1,104 +0,0 @@
---
name: image-to-image
description: "图片修改、图生图工具。基于输入的一张或多张图片,结合文本提示词生成新的图片。支持图片混合、风格转换、内容合成等多种创作模式。输入是文字+图片的组合,输出是图片。"
argument-hint: "需要 prompt(提示词)和 images(图片链接列表),可选 model(模型)、negative_prompt(反向提示词)、ratio(宽高比)、resolution(分辨率)"
---
# Image To Image Skill
## 描述
这是一个 AI 图生图技能,基于输入的一张或多张图片,结合文本提示词生成新的图片。支持图片混合、风格转换、内容合成等多种创作模式。
支持多个绘图模型:即梦(JiMeng)、豆包(DouBao)、造相(Z-Image)、OpenAI GPT Image。
从数据库中读取绘图配置(API 密钥、Base URL 等),根据用户选择的模型调用对应的绘图 API,返回生成的图片 URL。
这个仓库里额外提供了一个可执行脚本 `scripts/image_to_image.py`,方便宿主机器人直接调用。
## 触发条件
- 用户想基于图片生成新图片
- 用户说「把这张图变成……」「把图片修改成……」「风格转换」「图片合成」
- 用户提到「图生图」「图片编辑」「图片修改」
- 用户发送了一张或多张图片,并附带修改、合成、风格转换等描述
## 参数说明(JSON Schema)
调用脚本时,需要通过 shell 风格参数传入,参数结构如下:
```json
{
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "根据用户输入的文本内容,提取出图片混合、风格转换、内容合成等等的提示词,但是不要对提示词进行修改。"
},
"model": {
"type": "string",
"description": "画图模型选择可选即梦4.5(jimeng-4.5) / 即梦4.6(jimeng-4.6) / 即梦4.7(jimeng-4.7) / 即梦5.0(jimeng-5.0) / 豆包图生图(doubao-seededit-3.0-i2i) / 造相基础版(Z-Image) / 造相蒸馏版(Z-Image-Turbo) / 造相图片编辑(Qwen-Image-Edit-2511) / OpenAI GPT Image(gpt-image-2),默认: 空(none)。",
"enum": [
"none",
"jimeng-4.5",
"jimeng-4.6",
"jimeng-4.7",
"jimeng-5.0",
"doubao-seededit-3.0-i2i",
"Z-Image",
"Z-Image-Turbo",
"Qwen-Image-Edit-2511",
"gpt-image-2"
],
"default": "none"
},
"images": {
"type": "array",
"items": { "type": "string" },
"description": "用于图片编辑、图片混合、风格转换、内容合成等的图片链接列表,至少需要一张图像。"
},
"negative_prompt": {
"type": "string",
"description": "用于描述图像中不希望出现的元素或特征的文本,可选。"
},
"ratio": {
"type": "string",
"description": "图像的宽高比可选默认16:9。",
"default": "16:9"
},
"resolution": {
"type": "string",
"description": "图像的分辨率可选默认2k。",
"default": "2k"
}
},
"required": ["prompt", "images"],
"additionalProperties": false
}
```
对应的命令行参数为:
- `--prompt <提示词>` 必填
- `--images <图片链接>` 必填,可重复传入多张图片,如 `--images url1 --images url2`
- `--model <模型名>` 可选
- `--negative_prompt <反向提示词>` 可选
- `--ratio <宽高比>` 可选
- `--resolution <分辨率>` 可选
## 依赖安装
- 脚本首次运行时会自动创建虚拟环境并安装依赖,无需手动执行。
- 如需手动重新安装,可执行:`python3 scripts/bootstrap.py`
## 执行步骤
1. 当用户发送图片并附带修改、合成、风格转换等描述时触发该技能。
2. 从用户输入中提取 prompt(提示词),不对提示词做总结或修改。提取 images(图片链接列表)。可选提取 model、negative_prompt、ratio、resolution 参数。
3. 将参数组装为 shell 风格命令行参数,在仓库根目录下执行本地脚本,例如:`python3 scripts/image_to_image.py --prompt '把这张图变成油画风格' --images 'https://example.com/img1.jpg' --images 'https://example.com/img2.jpg' --model jimeng-5.0`。
4. 脚本生成图片后会自动调用客户端接口 `POST http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/message/send/image/url` 将图片发送给用户,成功时输出「图片发送成功」。
## 回复要求
- 成功时,脚本输出「图片发送成功」,表示图片已通过客户端接口直接发送,无需 AI 智能体再做额外处理。
- 失败时,返回具体的失败信息。

View File

@ -1,133 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import hashlib
import subprocess
import sys
import traceback
from pathlib import Path
sys.stderr = sys.stdout
def _skill_root_from(script_dir: Path) -> Path:
return script_dir.parent
def _venv_dir(script_dir: Path) -> Path:
    """Return the ``.venv`` directory located in the skill root."""
    skill_root = _skill_root_from(script_dir)
    return skill_root.joinpath(".venv")
def _venv_python(venv_dir: Path) -> Path:
if sys.platform == "win32":
return venv_dir / "Scripts" / "python.exe"
return venv_dir / "bin" / "python"
def _stamp_file(venv_dir: Path) -> Path:
return venv_dir / ".req_hash"
def _file_hash(path: Path) -> str:
return hashlib.sha256(path.read_bytes()).hexdigest()
def _deps_up_to_date(requirements_file: Path, venv_dir: Path) -> bool:
    """Report whether the stamp file matches the current requirements hash.

    Returns:
        False when the stamp file is missing; otherwise whether its stripped
        content equals the hash of ``requirements_file``.
    """
    stamp = _stamp_file(venv_dir)
    if stamp.is_file():
        recorded = stamp.read_text().strip()
        return recorded == _file_hash(requirements_file)
    return False
def _write_stamp(requirements_file: Path, venv_dir: Path) -> None:
    """Write the hash of ``requirements_file`` to the venv's stamp file.

    Args:
        requirements_file: Requirements file whose hash is recorded.
        venv_dir: Virtual environment directory that owns the stamp file.
    """
    stamp_path = _stamp_file(venv_dir)
    digest = _file_hash(requirements_file)
    stamp_path.write_text(digest)
def _ensure_venv(venv_dir: Path, venv_python: Path) -> int:
if venv_python.is_file():
return 0
sys.stdout.write(f"未检测到技能虚拟环境,正在创建: {venv_dir}\n")
import shutil
py = sys.executable or next(
(shutil.which(c) for c in ("python3", "python") if shutil.which(c)), None
)
if not py:
raise RuntimeError("无法找到 Python 解释器路径")
command = [
py,
"-m",
"venv",
str(venv_dir),
]
try:
subprocess.run(command, check=True, stdout=sys.stdout, stderr=sys.stdout)
except subprocess.CalledProcessError as exc:
sys.stdout.write(f"创建虚拟环境失败,退出码: {exc.returncode}\n")
return exc.returncode or 1
return 0
def main() -> int:
    """Ensure the skill venv exists and that its requirements are installed.

    Installation is skipped when the stored requirements hash is current.

    Returns:
        0 on success or when dependencies are already current, 1 when
        ``requirements.txt`` is missing, otherwise the exit code of the failing
        venv/pip command.
    """
    here = Path(__file__).resolve().parent
    requirements_file = here / "requirements.txt"
    venv_dir = _venv_dir(here)
    venv_python = _venv_python(venv_dir)

    if not requirements_file.is_file():
        sys.stdout.write(f"未找到依赖文件: {requirements_file}\n")
        return 1

    status = _ensure_venv(venv_dir, venv_python)
    if status != 0:
        return status

    if _deps_up_to_date(requirements_file, venv_dir):
        sys.stdout.write("依赖已是最新,跳过安装\n")
        return 0

    def _run(argv):
        """Run ``argv``; return the CalledProcessError on failure, else None."""
        try:
            subprocess.run(argv, check=True, stdout=sys.stdout, stderr=sys.stdout)
        except subprocess.CalledProcessError as failure:
            return failure
        return None

    pip_install = [str(venv_python), "-m", "pip", "install"]

    # Upgrade pip first, then install the pinned requirements.
    exc = _run(pip_install + ["--upgrade", "pip"])
    if exc is not None:
        sys.stdout.write(f"升级 pip 失败,退出码: {exc.returncode}\n")
        return exc.returncode or 1

    exc = _run(pip_install + ["-r", str(requirements_file)])
    if exc is not None:
        sys.stdout.write(f"安装依赖失败,退出码: {exc.returncode}\n")
        return exc.returncode or 1

    # Record the hash so the next run can skip installation.
    _write_stamp(requirements_file, venv_dir)
    sys.stdout.write(f"依赖安装完成,当前技能虚拟环境: {venv_dir}\n")
    return 0
if __name__ == "__main__":
    # Script entry point: exit with the code returned by main().
    try:
        raise SystemExit(main())
    except SystemExit:
        # Deliberate exits propagate unchanged.
        raise
    except Exception:
        # Unexpected errors: print the traceback to stdout and exit with status 1.
        traceback.print_exc(file=sys.stdout)
        raise SystemExit(1)

View File

@ -1,751 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import base64
import json
import mimetypes
import os
import re
import subprocess
import sys
import tempfile
import time
import traceback
import urllib.parse
import urllib.request
from pathlib import Path
# The skill runner consumes stdout, so route Python error output there as well.
sys.stderr = sys.stdout
def _skill_root() -> Path:
script_dir = Path(__file__).resolve().parent
return script_dir.parent
def _skill_venv_python() -> Path:
    """Return the expected interpreter path inside the skill's ``.venv``."""
    venv_dir = _skill_root() / ".venv"
    if sys.platform == "win32":
        parts = ("Scripts", "python.exe")
    else:
        parts = ("bin", "python")
    return venv_dir.joinpath(*parts)
def _get_python_executable() -> str:
if sys.executable:
return sys.executable
import shutil
for candidate in ("python3", "python"):
found = shutil.which(candidate)
if found:
return found
raise RuntimeError("无法找到 Python 解释器路径")
def _run_bootstrap() -> None:
    """Run the sibling ``bootstrap.py`` and abort if it fails.

    Raises:
        SystemExit: With bootstrap's exit code when that code is non-zero.
    """
    script = Path(__file__).resolve().parent / "bootstrap.py"
    completed = subprocess.run([_get_python_executable(), str(script)])
    exit_code = completed.returncode
    if exit_code != 0:
        raise SystemExit(exit_code)
def _ensure_skill_venv_python() -> None:
    """Make sure this script runs under the skill's own virtual environment.

    Bootstraps the venv when its interpreter is missing. Returns when
    ``sys.prefix`` already equals the venv directory; otherwise replaces the
    current process (``os.execv``) with the venv interpreter running this same
    script with the same arguments.

    Raises:
        SystemExit: If bootstrap fails, or with code 1 when the interpreter is
            still missing after bootstrap.
    """
    interpreter = _skill_venv_python()
    if not interpreter.is_file():
        _run_bootstrap()
        interpreter = _skill_venv_python()
        if not interpreter.is_file():
            sys.stdout.write("bootstrap 后仍未找到虚拟环境\n")
            raise SystemExit(1)

    venv_root = (_skill_root() / ".venv").resolve()
    already_inside = Path(sys.prefix) == venv_root
    if already_inside:
        return

    this_script = str(Path(__file__).resolve())
    os.execv(str(interpreter), [str(interpreter), this_script, *sys.argv[1:]])
# Switch to the skill's virtualenv (bootstrapping it if needed) before importing
# third-party dependencies.
_ensure_skill_venv_python()
try:
    import pymysql  # type: ignore # noqa: E402
    from openai import OpenAI  # type: ignore # noqa: E402
except ModuleNotFoundError:
    # Dependency missing: run bootstrap, then restart this script with the same arguments.
    # NOTE(review): if the imports still fail after bootstrap this looks like it would
    # re-exec repeatedly — confirm bootstrap always ends with both packages installed.
    _run_bootstrap()
    _py = _get_python_executable()
    os.execv(_py, [_py, str(Path(__file__).resolve()), *sys.argv[1:]])
# ---------------------------------------------------------------------------
# Database helpers
# ---------------------------------------------------------------------------
def _mysql_connect():
host = os.environ.get("MYSQL_HOST", "127.0.0.1")
port = int(os.environ.get("MYSQL_PORT", "3306"))
user = os.environ.get("MYSQL_USER", "root")
password = os.environ.get("MYSQL_PASSWORD", "")
database = os.environ.get("ROBOT_CODE", "")
if not database:
raise RuntimeError("环境变量 ROBOT_CODE 未配置")
return pymysql.connect(
host=host, port=port, user=user, password=password,
database=database, charset="utf8mb4",
connect_timeout=10, read_timeout=30,
)
def _query_one(conn, sql: str, params: tuple = ()) -> dict | None:
cur = conn.cursor()
cur.execute(sql, params)
columns = [desc[0] for desc in cur.description] if cur.description else []
row = cur.fetchone()
cur.close()
if row is None:
return None
return dict(zip(columns, row))
# ---------------------------------------------------------------------------
# Settings resolution (mirrors the Go service logic)
# ---------------------------------------------------------------------------
def load_drawing_settings(conn, from_wx_id: str) -> tuple[bool, dict]:
    """Return ``(enabled, image_ai_settings)`` for the given chat.

    Global settings are read first; a per-chat row (``chat_room_settings`` for
    ids ending in ``@chatroom``, otherwise ``friend_settings``) overrides the
    enabled flag when set and the settings JSON when non-blank.

    Args:
        conn: Open database connection.
        from_wx_id: Chat room id or friend WeChat id the message came from.
    """
    _unset = object()

    def _parse_settings(raw):
        """Decode a settings value; return ``_unset`` when it is empty or blank."""
        if not raw:
            return _unset
        if isinstance(raw, (bytes, bytearray)):
            raw = raw.decode("utf-8")
        if isinstance(raw, str) and raw.strip():
            return json.loads(raw)
        return _unset

    enabled = False
    settings_json: dict = {}

    gs = _query_one(conn, "SELECT image_ai_enabled, image_ai_settings FROM global_settings LIMIT 1")
    if gs:
        enabled = bool(gs.get("image_ai_enabled"))
        parsed = _parse_settings(gs.get("image_ai_settings"))
        if parsed is not _unset:
            settings_json = parsed

    if from_wx_id.endswith("@chatroom"):
        override = _query_one(
            conn,
            "SELECT image_ai_enabled, image_ai_settings FROM chat_room_settings WHERE chat_room_id = %s LIMIT 1",
            (from_wx_id,),
        )
    else:
        override = _query_one(
            conn,
            "SELECT image_ai_enabled, image_ai_settings FROM friend_settings WHERE wechat_id = %s LIMIT 1",
            (from_wx_id,),
        )

    if override:
        flag = override.get("image_ai_enabled")
        if flag is not None:
            enabled = bool(flag)
        parsed = _parse_settings(override.get("image_ai_settings"))
        if parsed is not _unset:
            settings_json = parsed

    return enabled, settings_json
# ---------------------------------------------------------------------------
# API callers
# ---------------------------------------------------------------------------
def _http_post_json(url: str, body: dict, headers: dict, timeout: int = 300) -> dict:
    """POST ``body`` as JSON to ``url`` and return the decoded JSON response.

    Args:
        url: Endpoint to call.
        body: JSON-serialisable request payload.
        headers: Request headers.
        timeout: Timeout in seconds passed to ``urlopen``.
    """
    payload = json.dumps(body).encode("utf-8")
    request = urllib.request.Request(url, data=payload, headers=headers, method="POST")
    with urllib.request.urlopen(request, timeout=timeout) as response:
        text = response.read().decode("utf-8")
        return json.loads(text)
def _http_get_json(url: str, headers: dict, timeout: int = 30) -> dict:
    """GET ``url`` and return the decoded JSON response.

    Args:
        url: Endpoint to call.
        headers: Request headers.
        timeout: Timeout in seconds passed to ``urlopen``.
    """
    request = urllib.request.Request(url, headers=headers, method="GET")
    with urllib.request.urlopen(request, timeout=timeout) as response:
        text = response.read().decode("utf-8")
        return json.loads(text)
def _coerce_int(value, default: int, minimum: int, maximum: int) -> int:
try:
parsed = int(value)
except (TypeError, ValueError):
parsed = default
return min(max(parsed, minimum), maximum)
def _openai_output_format(config: dict) -> str:
output_format = str(config.get("output_format", "png") or "png").lower()
if output_format not in {"png", "jpeg", "webp"}:
return "png"
return output_format
def _openai_size(config: dict, ratio: str, resolution: str) -> str:
configured = str(config.get("size", "") or "").strip()
if configured:
return configured
normalized_ratio = (ratio or "").replace(" ", "").lower()
normalized_resolution = (resolution or "").replace(" ", "").lower()
if normalized_resolution in {"4k", "2160p", "3840x2160"}:
sizes = {
"16:9": "3840x2160",
"9:16": "2160x3840",
"1:1": "2048x2048",
"3:2": "3072x2048",
"2:3": "2048x3072",
}
elif normalized_resolution in {"2k", "1440p", "2048"}:
sizes = {
"16:9": "2048x1152",
"9:16": "1152x2048",
"1:1": "2048x2048",
"3:2": "2048x1360",
"2:3": "1360x2048",
}
elif normalized_resolution in {"1k", "1024", "1024p"}:
sizes = {
"16:9": "1536x864",
"9:16": "864x1536",
"1:1": "1024x1024",
"3:2": "1536x1024",
"2:3": "1024x1536",
}
else:
return "auto"
return sizes.get(normalized_ratio, "auto")
def _openai_prompt(prompt: str, negative_prompt: str) -> str:
if not negative_prompt:
return prompt
return f"{prompt}\n\n不要包含: {negative_prompt}"
def _openai_client(config: dict) -> OpenAI:
api_key = str(config.get("api_key", "")).strip()
if not api_key:
raise RuntimeError("OpenAI 绘图配置缺少 api_key")
base_url = str(config.get("base_url", "") or "").strip()
organization = str(config.get("organization", "") or "").strip()
project = str(config.get("project", "") or "").strip()
timeout: float | None = None
timeout_value = config.get("timeout")
if timeout_value not in (None, ""):
timeout = float(timeout_value)
return OpenAI(
api_key=api_key,
base_url=base_url or None,
organization=organization or None,
project=project or None,
timeout=timeout,
)
def _truncate_debug_payload(value):
if isinstance(value, dict):
return {
key: (
f"{item[:50]}..." if key == "b64_json" and isinstance(item, str) and len(item) > 50 else _truncate_debug_payload(item)
)
for key, item in value.items()
}
if isinstance(value, list):
return [_truncate_debug_payload(item) for item in value]
return value
def _debug_response(label: str, payload) -> None:
    """Write a one-line ``[debug]`` JSON dump of ``payload`` to stdout.

    Objects exposing ``model_dump`` are converted through it first, and long
    base64 image strings are truncated before serialisation.
    """
    body = payload.model_dump() if hasattr(payload, "model_dump") else payload
    rendered = json.dumps(_truncate_debug_payload(body), ensure_ascii=False)
    sys.stdout.write(f"[debug] {label}: {rendered}\n")
def _rewrite_openai_image_url(url: str) -> str:
internal_host = "http://chatgpt2api:80"
external_host = "https://chatgpt2api.houhoukang.com"
if url.startswith(internal_host):
return f"{external_host}{url[len(internal_host):]}"
return url
def _extension_from_output_format(output_format: str) -> str:
if output_format == "jpeg":
return ".jpg"
if output_format == "webp":
return ".webp"
return ".png"
def _openai_response_value(item, key: str):
if isinstance(item, dict):
return item.get(key)
return getattr(item, key, None)
def _write_openai_b64_image(b64_json: str, output_format: str) -> str:
    """Decode a base64 image and store it in a persistent temporary file.

    Accepts raw base64 or a ``data:`` URI. For a ``data:`` URI that declares a
    MIME type, the file extension follows that type; otherwise it follows
    ``output_format``. Embedded whitespace is removed and missing ``=``
    padding is restored before decoding.

    Returns:
        Path of the created file. The name starts with
        ``wechat-openai-image-`` and the file is not deleted automatically.
    """
    payload = b64_json.strip()
    extension = _extension_from_output_format(output_format)

    if payload.startswith("data:"):
        meta, payload = payload.split(",", 1)
        declared_mime = meta[5:].split(";", 1)[0].strip().lower()
        if declared_mime:
            extension = _extension_from_mime(declared_mime)

    payload = "".join(payload.split())
    # Pad to a multiple of four characters, as the decoder requires.
    payload += "=" * (-len(payload) % 4)
    decoded = base64.b64decode(payload)

    with tempfile.NamedTemporaryFile(
        prefix="wechat-openai-image-", suffix=extension, delete=False
    ) as handle:
        handle.write(decoded)
        return handle.name
def _openai_images_from_response(response, output_format: str) -> list[str]:
    """Collect the images of an OpenAI response as local paths or URLs.

    For each entry of ``response.data``, a ``b64_json`` payload takes
    precedence and is written to a temporary file; otherwise a ``url`` is
    kept after host rewriting. Entries with neither are ignored. If anything
    fails, temporary files created so far are removed before re-raising.
    """
    collected: list[str] = []
    try:
        for entry in getattr(response, "data", []) or []:
            encoded = _openai_response_value(entry, "b64_json")
            if encoded:
                collected.append(_write_openai_b64_image(str(encoded), output_format))
            else:
                link = _openai_response_value(entry, "url")
                if link:
                    collected.append(_rewrite_openai_image_url(str(link)))
    except Exception:
        _cleanup_openai_temp_files(collected)
        raise
    return collected
def _is_remote_image_url(value: str) -> bool:
return urllib.parse.urlparse(value).scheme in {"http", "https"}
def _send_image_outputs(client_port: str, from_wx_id: str, image_outputs: list[str]) -> None:
    """Send generated images to ``from_wx_id`` through the local client API.

    Remote URLs go out in one batched request to ``.../send/image/url``.
    Local file paths are then sent one request each to
    ``.../send/image/local``. Empty entries are ignored.
    """
    remote_batch: list[str] = []
    local_files: list[str] = []
    for entry in image_outputs:
        if not entry:
            continue
        (remote_batch if _is_remote_image_url(entry) else local_files).append(entry)

    api_root = f"http://127.0.0.1:{client_port}/api/v1/robot/message/send/image"
    json_headers = {"Content-Type": "application/json"}

    if remote_batch:
        reply = _http_post_json(
            f"{api_root}/url",
            {"to_wxid": from_wx_id, "image_urls": remote_batch},
            json_headers,
            timeout=300,
        )
        _debug_response("send image url response", reply)

    for local_file in local_files:
        reply = _http_post_json(
            f"{api_root}/local",
            {"to_wxid": from_wx_id, "file_path": local_file},
            json_headers,
            timeout=300,
        )
        _debug_response("send image local response", reply)
def _cleanup_openai_temp_files(image_outputs: list[str]) -> None:
for value in image_outputs:
path = Path(value)
if path.name.startswith("wechat-openai-image-") and path.is_file():
try:
path.unlink()
except OSError:
pass
def _extension_from_mime(mime_type: str) -> str:
if mime_type == "image/jpeg":
return ".jpg"
guessed = mimetypes.guess_extension(mime_type)
if guessed in {".png", ".jpg", ".jpeg", ".webp"}:
return guessed
return ".png"
def _download_openai_input_image(image: str, directory: str, index: int) -> Path:
    """Materialise one input image as a local file and return its path.

    ``image`` may be a ``data:`` URI, an ``http``/``https`` URL, or a local
    file path. Data URIs and downloads are written to
    ``<directory>/input-<index><ext>``; an existing local file is returned
    as-is.

    Raises:
        RuntimeError: if ``image`` is none of the above or the file is missing.
    """
    source = image.strip()

    if source.startswith("data:"):
        meta, payload = source.split(",", 1)
        mime = meta[5:].split(";", 1)[0] or "image/png"
        target = Path(directory) / f"input-{index}{_extension_from_mime(mime)}"
        target.write_bytes(base64.b64decode(payload))
        return target

    url_parts = urllib.parse.urlparse(source)
    if url_parts.scheme in {"http", "https"}:
        download = urllib.request.Request(source, headers={"User-Agent": "wechat-robot-skills/1.0"})
        with urllib.request.urlopen(download, timeout=60) as reply:
            mime = reply.headers.get("Content-Type", "image/png").split(";", 1)[0].strip()
            # Trust the URL's extension when it is a known image type;
            # otherwise derive one from the Content-Type header.
            extension = Path(url_parts.path).suffix.lower()
            if extension not in {".png", ".jpg", ".jpeg", ".webp"}:
                extension = _extension_from_mime(mime)
            target = Path(directory) / f"input-{index}{extension}"
            target.write_bytes(reply.read())
        return target

    local_file = Path(source).expanduser()
    if local_file.is_file():
        return local_file
    raise RuntimeError(f"无法读取图片: {image}")
def call_jimeng(config: dict, prompt: str, model: str, images: list[str],
                negative_prompt: str, ratio: str, resolution: str) -> list[str]:
    """Call the JiMeng image-compositions (image-to-image) API.

    Defaults: model ``jimeng-5.0``, ratio ``16:9``, resolution ``2k``. A
    resolution whose first number is greater than 4 is reset to ``2k``.

    Returns:
        The non-empty ``url`` values from the response's ``data`` list.

    Raises:
        RuntimeError: if ``base_url`` or ``sessionid`` is missing from config.
    """
    endpoint_root = config.get("base_url", "").rstrip("/")
    sessions = config.get("sessionid", [])
    if not (endpoint_root and sessions):
        raise RuntimeError("即梦绘图配置缺少 base_url 或 sessionid")

    if not model or model == "none":
        model = "jimeng-5.0"
    ratio = ratio or "16:9"
    resolution = resolution or "2k"

    # Anything above "4k" is not accepted; fall back to 2k.
    leading_number = re.search(r"(\d+)", resolution)
    if leading_number is not None and int(leading_number.group(1)) > 4:
        resolution = "2k"

    request_body = {
        "model": model,
        "prompt": prompt,
        "images": images,
        "ratio": ratio,
        "resolution": resolution,
        "response_format": "url",
        "sample_strength": 0.5,
    }
    if negative_prompt:
        request_body["negative_prompt"] = negative_prompt

    # All session ids are sent together as one comma-separated bearer token.
    request_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {','.join(session_ids for session_ids in sessions)}",
    }
    # Image-to-image uses the /v1/images/compositions endpoint.
    reply = _http_post_json(
        f"{endpoint_root}/v1/images/compositions",
        request_body,
        request_headers,
        timeout=300,
    )
    return [entry["url"] for entry in reply.get("data", []) if entry.get("url")]
def call_doubao(config: dict, prompt: str, model: str, image: str) -> list[str]:
    """Call the DouBao image-to-image API with a single input image.

    The friendly model name is translated to the endpoint model id when a
    mapping exists; unknown names are sent unchanged.

    Returns:
        The non-empty ``url`` values from the response's ``data`` list.

    Raises:
        RuntimeError: if ``api_key`` is missing from config.
    """
    api_key = config.get("api_key", "")
    if not api_key:
        raise RuntimeError("豆包绘图配置缺少 api_key")

    if not model or model == "none":
        model = "doubao-seededit-3.0-i2i"
    endpoint_models = {
        "doubao-seededit-3.0-i2i": "doubao-seededit-3-0-i2i-250628",
    }

    request_body = {
        "model": endpoint_models.get(model, model),
        "prompt": prompt,
        "response_format": "url",
        "size": config.get("size", "2K"),
        "sequential_image_generation": config.get("sequential_image_generation", "auto"),
        "watermark": config.get("watermark", False),
    }
    if image:
        request_body["image"] = image

    reply = _http_post_json(
        "https://ark.cn-beijing.volces.com/api/v3/images/generations",
        request_body,
        {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
        timeout=300,
    )
    return [entry.get("url") for entry in reply.get("data", []) if entry.get("url")]
def call_zimage(config: dict, prompt: str, model: str, images: list[str]) -> list[str]:
    """Call the Z-Image API, which runs as an asynchronous task.

    A task is created first, then polled every 5 seconds for up to 15
    minutes until it reports ``SUCCEED`` or ``FAILED``.

    Returns:
        The ``output_images`` list of the finished task.

    Raises:
        RuntimeError: on missing config, unsupported model, missing task id,
            task failure, an empty result, or timeout.
    """
    endpoint_root = config.get("base_url", "").rstrip("/")
    api_key = config.get("api_key", "")
    if not (endpoint_root and api_key):
        raise RuntimeError("造相绘图配置缺少 base_url 或 api_key")

    if not model or model == "none":
        model = "Qwen-Image-Edit-2511"
    hub_model_ids = {
        "Z-Image": "Tongyi-MAI/Z-Image",
        "Z-Image-Turbo": "Tongyi-MAI/Z-Image-Turbo",
        "Qwen-Image-Edit-2511": "Qwen/Qwen-Image-Edit-2511",
    }
    hub_model = hub_model_ids.get(model)
    if hub_model is None:
        raise RuntimeError(f"不支持的造相模型: {model}")

    authorization = f"Bearer {api_key}"

    # Step 1: submit the generation task in async mode.
    created = _http_post_json(
        f"{endpoint_root}/v1/images/generations",
        {"model": hub_model, "prompt": prompt, "image_url": images},
        {
            "Content-Type": "application/json",
            "Authorization": authorization,
            "X-ModelScope-Async-Mode": "true",
        },
        timeout=30,
    )
    task_id = created.get("task_id", "")
    if not task_id:
        raise RuntimeError("造相接口未返回 task_id")

    # Step 2: poll the task until it finishes or the deadline passes.
    status_headers = {
        "Content-Type": "application/json",
        "Authorization": authorization,
        "X-ModelScope-Task-Type": "image_generation",
    }
    give_up_at = time.time() + 15 * 60  # 15 minutes
    while time.time() < give_up_at:
        task = _http_get_json(f"{endpoint_root}/v1/tasks/{task_id}", status_headers, timeout=30)
        state = task.get("task_status", "")
        if state == "SUCCEED":
            produced = task.get("output_images", [])
            if not produced:
                raise RuntimeError("造相任务成功但未返回图片")
            return produced
        if state == "FAILED":
            raise RuntimeError("造相绘图任务失败")
        time.sleep(5)
    raise RuntimeError("造相绘图任务超时")
def call_openai(config: dict, prompt: str, model: str, images: list[str],
                negative_prompt: str, ratio: str, resolution: str) -> list[str]:
    """Call the OpenAI GPT Image API to edit the given input images.

    At most the first 16 images are used. Each one is materialised as a local
    file and uploaded to ``client.images.edit``.

    Returns:
        Local temp-file paths and/or URLs of the generated images.

    Raises:
        RuntimeError: if the config lacks ``api_key`` or an image is unreadable.
    """
    import contextlib

    client = _openai_client(config)
    output_format = _openai_output_format(config)
    quality = str(config.get("quality", "auto") or "auto")
    background = str(config.get("background", "auto") or "auto")
    # NOTE(review): "transparent" is downgraded to "auto" here; presumably the
    # edit endpoint/model does not accept it - confirm.
    if background == "transparent":
        background = "auto"

    # The ExitStack closes every handle that was opened, even when a later
    # open() or the API call fails. It exits before the temp directory is
    # removed, so no file is still open during cleanup.
    with tempfile.TemporaryDirectory() as temp_dir, contextlib.ExitStack() as open_files:
        input_paths = [
            _download_openai_input_image(image, temp_dir, index)
            for index, image in enumerate(images[:16], start=1)
        ]
        input_files = [open_files.enter_context(path.open("rb")) for path in input_paths]

        kwargs = {
            "model": model or "gpt-image-2",
            "prompt": _openai_prompt(prompt, negative_prompt),
            "image": input_files,
            "n": _coerce_int(config.get("n"), 1, 1, 10),
            "size": _openai_size(config, ratio, resolution),
            "quality": quality,
            "background": background,
            "output_format": output_format,
        }
        # Compression is only sent for the lossy output formats.
        if output_format in {"jpeg", "webp"} and config.get("output_compression") is not None:
            kwargs["output_compression"] = _coerce_int(config.get("output_compression"), 100, 0, 100)
        response = client.images.edit(**kwargs)

    _debug_response("openai images.edit response", response)
    return _openai_images_from_response(response, output_format)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
# Model names that main() routes to call_jimeng.
JIMENG_MODELS = {"jimeng-4.5", "jimeng-4.6", "jimeng-4.7", "jimeng-5.0"}
# Model names that main() routes to call_doubao.
DOUBAO_MODELS = {"doubao-seededit-3.0-i2i"}
# Model names that main() routes to call_zimage.
ZIMAGE_MODELS = {"Z-Image", "Z-Image-Turbo", "Qwen-Image-Edit-2511"}
# Model names that main() routes to call_openai.
OPENAI_MODELS = {"gpt-image-2"}
def _parse_cli_params(argv: list[str]) -> dict:
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--prompt", default="")
parser.add_argument("--images", action="append", default=[])
parser.add_argument("--model", default="")
parser.add_argument("--negative_prompt", default="")
parser.add_argument("--ratio", default="")
parser.add_argument("--resolution", default="")
namespace, unknown = parser.parse_known_args(argv)
if unknown:
raise ValueError(f"存在不支持的参数: {' '.join(unknown)}")
return {
"prompt": namespace.prompt,
"images": [img for img in namespace.images if img.strip()],
"model": namespace.model,
"negative_prompt": namespace.negative_prompt,
"ratio": namespace.ratio,
"resolution": namespace.resolution,
}
def main() -> int:
    """Run the image-to-image skill end to end.

    Steps: parse CLI arguments, load the drawing settings from MySQL, call
    the backend matching the model, then send the results through the local
    client API. Every outcome is reported on stdout.

    Returns:
        0 on success or when drawing is disabled; 1 on any error.
    """
    say = sys.stdout.write

    if len(sys.argv) < 2:
        say("缺少输入参数\n")
        return 1

    try:
        params = _parse_cli_params(sys.argv[1:])
    except ValueError as exc:
        say(f"参数格式错误: {exc}\n")
        return 1

    prompt = params.get("prompt", "").strip()
    if not prompt:
        say("缺少提示词\n")
        return 1

    images = params.get("images", [])
    if not images:
        say("图片链接列表为空\n")
        return 1

    model = params.get("model", "").strip()
    negative_prompt = params.get("negative_prompt", "").strip()
    ratio = params.get("ratio", "").strip()
    resolution = params.get("resolution", "").strip()

    from_wx_id = os.environ.get("ROBOT_FROM_WX_ID", "").strip()
    if not from_wx_id:
        say("环境变量 ROBOT_FROM_WX_ID 未配置\n")
        return 1

    # Load the drawing settings; the connection is only needed for this step.
    try:
        conn = _mysql_connect()
    except Exception as exc:
        say(f"数据库连接失败: {exc}\n")
        return 1
    try:
        enabled, settings_json = load_drawing_settings(conn, from_wx_id)
    except Exception as exc:
        say(f"加载绘图配置失败: {exc}\n")
        return 1
    finally:
        try:
            conn.close()
        except Exception:
            pass

    if not enabled:
        say("AI 绘图未开启\n")
        return 0

    # Fall back to the default model when none was requested.
    if not model or model == "none":
        model = "jimeng-5.0"

    # Dispatch to the backend that owns the requested model.
    try:
        image_urls: list[str] = []
        if model in JIMENG_MODELS:
            backend = settings_json.get("JiMeng", {})
            if not backend.get("enabled", False):
                say("即梦绘图未开启\n")
                return 0
            image_urls = call_jimeng(backend, prompt, model, images, negative_prompt, ratio, resolution)
        elif model in DOUBAO_MODELS:
            backend = settings_json.get("DouBao", {})
            if not backend.get("enabled", False):
                say("豆包绘图未开启\n")
                return 0
            # DouBao image-to-image accepts a single input image only.
            image_urls = call_doubao(backend, prompt, model, images[0])
        elif model in ZIMAGE_MODELS:
            backend = settings_json.get("Z-Image", {})
            if not backend.get("enabled", False):
                say("造相绘图未开启\n")
                return 0
            image_urls = call_zimage(backend, prompt, model, images)
        elif model in OPENAI_MODELS:
            backend = settings_json.get("OpenAI", {})
            if not backend.get("enabled", False):
                say("OpenAI 绘图未开启\n")
                return 0
            image_urls = call_openai(backend, prompt, model, images, negative_prompt, ratio, resolution)
        else:
            say("不支持的 AI 图像模型\n")
            return 1
    except Exception as exc:
        say(f"调用绘图接口失败: {exc}\n")
        return 1

    if not image_urls:
        say("未生成任何图像\n")
        return 1

    # Deliver the images through the local client API.
    client_port = os.environ.get("ROBOT_WECHAT_CLIENT_PORT", "").strip()
    if not client_port:
        _cleanup_openai_temp_files(image_urls)
        say("环境变量 ROBOT_WECHAT_CLIENT_PORT 未配置\n")
        return 1

    try:
        _send_image_outputs(client_port, from_wx_id, image_urls)
        say("图片发送成功\n")
    except Exception as exc:
        say(f"发送图片失败: {exc}\n")
        return 1
    finally:
        # Temp files are removed whether or not sending succeeded.
        _cleanup_openai_temp_files(image_urls)
    return 0
if __name__ == "__main__":
    # Exit with main()'s return code. An unexpected exception is printed to
    # stdout (the skill runner reads stdout) and mapped to exit code 1.
    try:
        exit_code = main()
    except SystemExit:
        raise
    except Exception:
        traceback.print_exc(file=sys.stdout)
        exit_code = 1
    raise SystemExit(exit_code)

View File

@ -1,3 +0,0 @@
cryptography
openai>=2.34.0
pymysql>=1.1,<2

View File

@ -1,54 +0,0 @@
---
name: kfc
description: "当用户说「kfc」、「KFC」、「肯德基」或「肯德基文案」时触发。调用 KFC 文案接口,返回其中的文案内容。"
argument-hint: "无需参数,直接调用即可"
---
# KFC Skill
## 描述
这是一个用于获取肯德基疯狂星期四文案的技能。
当用户提到 `kfc`、`KFC`、`肯德基` 或 `肯德基文案` 时,调用接口获取最新文案,并把接口返回的文案直接回复给用户。
这个仓库里额外提供了一个可执行脚本 `scripts/kfc.py`,方便宿主机器人直接调用。
## 触发条件
- 用户说「kfc」
- 用户说「KFC」
- 用户说「肯德基」
- 用户说「肯德基文案」
## 接口信息
- 请求地址:`https://api.pearapi.ai/api/kfc?type=json`
- 请求方式:`GET`
- 本地脚本:`scripts/kfc.py`
- 返回示例:
```json
{
"code": 200,
"msg": "获取成功",
"text": "14看着不香果然还是13更香iPhone14真是更新了个寂寞......今天肯德基疯狂星期四,谁请我吃?",
"api_source": "官方API网:https://api.pearapi.ai/"
}
```
- 关键字段:`text`,表示需要返回给用户的肯德基文案内容。
## 执行步骤
1. 当用户输入 `kfc`、`KFC`、`肯德基` 或 `肯德基文案` 时触发该技能。
2. 在仓库根目录下执行本地脚本:`python3 scripts/kfc.py`。
3. 脚本内部发送 `GET` 请求到 `https://api.pearapi.ai/api/kfc?type=json`
4. 脚本解析返回的 JSON,并输出 `text` 字段。
5. 如果接口请求失败、返回格式异常,或没有拿到 `text`,脚本输出:`今天的肯德基文案暂时没拿到,等我再去问问。`
6. 如果脚本无法执行(Python 环境不可用),直接回复兜底文案:`今天的肯德基文案暂时没拿到,等我再去问问。`
## 回复要求
- 只返回接口中的 `text` 文案内容,不要额外添加解释。
- 当接口异常时,使用固定兜底文案回复。

View File

@ -1,46 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import sys
import traceback
import urllib.error
import urllib.request
sys.stderr = sys.stdout
# Endpoint returning a JSON object whose "text" field holds the copy.
API_URL = "https://api.pearapi.ai/api/kfc?type=json"
# Reply used whenever the copy cannot be obtained.
FALLBACK_TEXT = "今天的肯德基文案暂时没拿到,等我再去问问。"


def fetch_kfc_copy() -> str:
    """Fetch the KFC copy and wrap it in ``<wechat-robot-text>`` tags.

    Returns ``FALLBACK_TEXT`` on any network, HTTP or decoding failure, when
    the response is not a JSON object, or when it has no non-blank string in
    ``text``.
    """
    import http.client

    try:
        with urllib.request.urlopen(API_URL, timeout=10) as response:
            payload = json.load(response)
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError and timeouts; ValueError covers invalid
        # JSON and undecodable bytes; HTTPException covers truncated replies.
        return FALLBACK_TEXT

    # A valid JSON document is not necessarily an object (could be a list).
    if not isinstance(payload, dict):
        return FALLBACK_TEXT

    text = payload.get("text")
    if isinstance(text, str) and text.strip():
        # The API occasionally returns double-escaped newlines (a literal
        # backslash followed by "n"); turn them back into real newlines.
        return "<wechat-robot-text>" + text.replace("\\n", "\n") + "</wechat-robot-text>"
    return FALLBACK_TEXT
def main() -> int:
    """Print the KFC copy (or the fallback text) followed by a newline."""
    sys.stdout.write(fetch_kfc_copy() + "\n")
    return 0
if __name__ == "__main__":
    # Exit with main()'s return code. An unexpected exception is printed to
    # stdout (the skill runner reads stdout) and mapped to exit code 1.
    try:
        exit_code = main()
    except SystemExit:
        raise
    except Exception:
        traceback.print_exc(file=sys.stdout)
        exit_code = 1
    raise SystemExit(exit_code)

View File

@ -1,22 +0,0 @@
---
name: ping
description: "示例技能。当用户说「使用示例技能」、「ping」或「调用示例」时触发,返回 pong。"
argument-hint: "无需参数,直接调用即可"
---
# Ping Skill
## 描述
这是一个最简单的示例技能,用于演示 Agent Skills 的基本结构。
## 触发条件
- 用户说「使用示例技能」
- 用户说「ping」
- 用户说「调用示例」
## 执行步骤
1. 接收到用户调用请求
2. 直接回复:`pong`

View File

@ -1,99 +0,0 @@
---
name: text-to-image
description: "AI绘图工具,当用户想通过文本生成图像时,可以调用该工具。根据用户输入内容,提取画图提示词,选择合适的模型进行绘图,返回生成的图片。"
argument-hint: "需要 prompt 参数(画图提示词),可选 model(模型)、negative_prompt(反向提示词)、ratio(宽高比)、resolution(分辨率)"
---
# Text To Image Skill
## 描述
这是一个 AI 文生图技能,当用户想通过文本描述生成图像时触发。支持多个绘图模型:即梦(JiMeng)、豆包(DouBao)、造相(Z-Image)、OpenAI GPT Image。
从数据库中读取绘图配置(API 密钥、Base URL 等),根据用户选择的模型调用对应的绘图 API,返回生成的图片 URL。
这个仓库里额外提供了一个可执行脚本 `scripts/text_to_image.py`,方便宿主机器人直接调用。
## 触发条件
- 用户想画图、生成图片
- 用户说「画一张……」「生成一张……的图片」「帮我画……」
- 用户提到「文生图」「AI绘图」「AI画图」
- 用户描述了想要生成的图片内容
## 参数说明(JSON Schema)
调用脚本时,需要通过 shell 风格参数传入,参数结构如下:
```json
{
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "根据用户输入内容,提取出的画图提示词,但是不要对提示词进行总结。"
},
"model": {
"type": "string",
"description": "画图模型选择可选即梦4.5(jimeng-4.5) / 即梦4.6(jimeng-4.6) / 即梦4.7(jimeng-4.7) / 即梦5.0(jimeng-5.0) / 豆包4.5(doubao-seedream-4.5) / 豆包4.0(doubao-seedream-4.0) / 豆包文生图(doubao-seedream-3.0-t2i) / 豆包图生图(doubao-seededit-3.0-i2i) / 造相基础版(Z-Image) / 造相蒸馏版(Z-Image-Turbo) / 造相图片编辑(Qwen-Image-Edit-2511) / OpenAI GPT Image(gpt-image-2),默认: 空(none)。",
"enum": [
"none",
"jimeng-4.5",
"jimeng-4.6",
"jimeng-4.7",
"jimeng-5.0",
"doubao-seedream-4.5",
"doubao-seedream-4.0",
"doubao-seedream-3.0-t2i",
"doubao-seededit-3.0-i2i",
"Z-Image",
"Z-Image-Turbo",
"Qwen-Image-Edit-2511",
"gpt-image-2"
],
"default": "none"
},
"negative_prompt": {
"type": "string",
"description": "用于描述图像中不希望出现的元素或特征的文本,可选。"
},
"ratio": {
"type": "string",
"description": "图像的宽高比可选默认16:9。",
"default": "16:9"
},
"resolution": {
"type": "string",
"description": "图像的分辨率可选默认2k。",
"default": "2k"
}
},
"required": ["prompt"],
"additionalProperties": false
}
```
对应的命令行参数为:
- `--prompt <画图提示词>` 必填
- `--model <模型名>` 可选
- `--negative_prompt <反向提示词>` 可选
- `--ratio <宽高比>` 可选
- `--resolution <分辨率>` 可选
## 依赖安装
- 脚本首次运行时会自动创建虚拟环境并安装依赖,无需手动执行。
- 如需手动重新安装,可执行:`python3 scripts/bootstrap.py`
## 执行步骤
1. 当用户想通过文本描述生成图像时触发该技能。
2. 从用户输入中提取 prompt(画图提示词),不对提示词做总结或修改。可选提取 model、negative_prompt、ratio、resolution 参数。
3. 将参数组装为 shell 风格命令行参数,在仓库根目录下执行本地脚本,例如:`python3 scripts/text_to_image.py --prompt '一只可爱的猫咪在花园里玩耍' --model jimeng-5.0`。
4. 脚本生成图片后会自动调用客户端接口 `POST http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/message/send/image/url` 将图片发送给用户,成功时输出「图片发送成功」。
## 回复要求
- 成功时,脚本输出「图片发送成功」,表示图片已通过客户端接口直接发送,无需 AI 智能体再做额外处理。
- 失败时,返回具体的失败信息。

View File

@ -1,133 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import hashlib
import subprocess
import sys
import traceback
from pathlib import Path
sys.stderr = sys.stdout
def _skill_root_from(script_dir: Path) -> Path:
return script_dir.parent
def _venv_dir(script_dir: Path) -> Path:
    """Return the skill's virtual-environment directory (``<root>/.venv``)."""
    return _skill_root_from(script_dir).joinpath(".venv")
def _venv_python(venv_dir: Path) -> Path:
if sys.platform == "win32":
return venv_dir / "Scripts" / "python.exe"
return venv_dir / "bin" / "python"
def _stamp_file(venv_dir: Path) -> Path:
return venv_dir / ".req_hash"
def _file_hash(path: Path) -> str:
return hashlib.sha256(path.read_bytes()).hexdigest()
def _deps_up_to_date(requirements_file: Path, venv_dir: Path) -> bool:
    """Tell whether the venv was installed from the current requirements file.

    True only when the stamp file exists and its stripped content equals the
    current hash of ``requirements_file``.
    """
    stamp_path = _stamp_file(venv_dir)
    if stamp_path.is_file():
        recorded_hash = stamp_path.read_text().strip()
        return recorded_hash == _file_hash(requirements_file)
    return False
def _write_stamp(requirements_file: Path, venv_dir: Path) -> None:
    """Record the current requirements hash in the venv's stamp file."""
    current_hash = _file_hash(requirements_file)
    _stamp_file(venv_dir).write_text(current_hash)
def _ensure_venv(venv_dir: Path, venv_python: Path) -> int:
if venv_python.is_file():
return 0
sys.stdout.write(f"未检测到技能虚拟环境,正在创建: {venv_dir}\n")
import shutil
py = sys.executable or next(
(shutil.which(c) for c in ("python3", "python") if shutil.which(c)), None
)
if not py:
raise RuntimeError("无法找到 Python 解释器路径")
command = [
py,
"-m",
"venv",
str(venv_dir),
]
try:
subprocess.run(command, check=True, stdout=sys.stdout, stderr=sys.stdout)
except subprocess.CalledProcessError as exc:
sys.stdout.write(f"创建虚拟环境失败,退出码: {exc.returncode}\n")
return exc.returncode or 1
return 0
def main() -> int:
    """Create the skill venv if needed and install its requirements.

    Installation is skipped when the stamp file already matches the current
    ``requirements.txt``. After a successful install the stamp is rewritten.

    Returns:
        0 on success; otherwise 1 or the exit code of the failed step.
    """
    script_dir = Path(__file__).resolve().parent
    requirements_file = script_dir / "requirements.txt"
    venv_dir = _venv_dir(script_dir)
    venv_python = _venv_python(venv_dir)

    if not requirements_file.is_file():
        sys.stdout.write(f"未找到依赖文件: {requirements_file}\n")
        return 1

    venv_status = _ensure_venv(venv_dir, venv_python)
    if venv_status != 0:
        return venv_status

    if _deps_up_to_date(requirements_file, venv_dir):
        sys.stdout.write("依赖已是最新,跳过安装\n")
        return 0

    # Upgrade pip first, then install the requirements; stop at the first failure.
    pip_steps = (
        (["install", "--upgrade", "pip"], "升级 pip 失败,退出码: "),
        (["install", "-r", str(requirements_file)], "安装依赖失败,退出码: "),
    )
    for pip_args, failure_prefix in pip_steps:
        try:
            subprocess.run(
                [str(venv_python), "-m", "pip", *pip_args],
                check=True,
                stdout=sys.stdout,
                stderr=sys.stdout,
            )
        except subprocess.CalledProcessError as exc:
            sys.stdout.write(f"{failure_prefix}{exc.returncode}\n")
            return exc.returncode or 1

    _write_stamp(requirements_file, venv_dir)
    sys.stdout.write(f"依赖安装完成,当前技能虚拟环境: {venv_dir}\n")
    return 0
if __name__ == "__main__":
    # Exit with main()'s return code. An unexpected exception is printed to
    # stdout (the skill runner reads stdout) and mapped to exit code 1.
    try:
        exit_code = main()
    except SystemExit:
        raise
    except Exception:
        traceback.print_exc(file=sys.stdout)
        exit_code = 1
    raise SystemExit(exit_code)

View File

@ -1,3 +0,0 @@
cryptography
openai>=2.34.0
pymysql>=1.1,<2

View File

@ -1,713 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import base64
import json
import mimetypes
import os
import re
import subprocess
import sys
import tempfile
import time
import traceback
import urllib.parse
import urllib.request
from pathlib import Path
# The skill runner consumes stdout, so route Python error output there as well.
sys.stderr = sys.stdout
def _skill_root() -> Path:
script_dir = Path(__file__).resolve().parent
return script_dir.parent
def _skill_venv_python() -> Path:
    """Return the interpreter path inside the skill's ``.venv`` directory."""
    relative = ("Scripts", "python.exe") if sys.platform == "win32" else ("bin", "python")
    return (_skill_root() / ".venv").joinpath(*relative)
def _get_python_executable() -> str:
if sys.executable:
return sys.executable
import shutil
for candidate in ("python3", "python"):
found = shutil.which(candidate)
if found:
return found
raise RuntimeError("无法找到 Python 解释器路径")
def _run_bootstrap() -> None:
    """Run the sibling ``bootstrap.py``; exit with its code if it fails."""
    bootstrap_script = Path(__file__).resolve().parent / "bootstrap.py"
    completed = subprocess.run([_get_python_executable(), str(bootstrap_script)])
    exit_code = completed.returncode
    if exit_code != 0:
        raise SystemExit(exit_code)
def _ensure_skill_venv_python() -> None:
    """Make sure the script is running under the skill's venv interpreter.

    Bootstraps the venv when its interpreter is missing. If the current
    process is not already running from that venv, it is replaced (via
    ``os.execv``) by the venv interpreter running this same script with the
    same arguments.
    """
    interpreter = _skill_venv_python()
    if not interpreter.is_file():
        _run_bootstrap()
        interpreter = _skill_venv_python()
        if not interpreter.is_file():
            sys.stdout.write("bootstrap 后仍未找到虚拟环境\n")
            raise SystemExit(1)

    expected_prefix = (_skill_root() / ".venv").resolve()
    if Path(sys.prefix) == expected_prefix:
        # Already inside the skill venv: nothing to do.
        return

    this_script = str(Path(__file__).resolve())
    os.execv(str(interpreter), [str(interpreter), this_script, *sys.argv[1:]])
_ensure_skill_venv_python()
try:
    import pymysql  # type: ignore # noqa: E402
    from openai import OpenAI  # type: ignore # noqa: E402
except ModuleNotFoundError:
    # Dependencies are missing: install them once and restart the script.
    # The marker variable survives os.execv. If the import still fails after
    # one bootstrap (e.g. bootstrap skipped the install because its stamp
    # matched), re-raise instead of re-executing forever.
    if os.environ.get("WECHAT_ROBOT_SKILL_BOOTSTRAPPED") == "1":
        sys.stdout.write("依赖安装后仍无法导入所需模块\n")
        raise
    _run_bootstrap()
    os.environ["WECHAT_ROBOT_SKILL_BOOTSTRAPPED"] = "1"
    _py = _get_python_executable()
    os.execv(_py, [_py, str(Path(__file__).resolve()), *sys.argv[1:]])
# ---------------------------------------------------------------------------
# Database helpers
# ---------------------------------------------------------------------------
def _mysql_connect():
    """Open a MySQL connection using the runner-provided environment variables.

    The database name is taken from ``ROBOT_CODE``.

    Raises:
        RuntimeError: if ``ROBOT_CODE`` is not set.
        ValueError: if ``MYSQL_PORT`` is not an integer.
    """
    env = os.environ
    connection_options = {
        "host": env.get("MYSQL_HOST", "127.0.0.1"),
        "port": int(env.get("MYSQL_PORT", "3306")),
        "user": env.get("MYSQL_USER", "root"),
        "password": env.get("MYSQL_PASSWORD", ""),
        "database": env.get("ROBOT_CODE", ""),
        "charset": "utf8mb4",
        "connect_timeout": 10,
        "read_timeout": 30,
    }
    if not connection_options["database"]:
        raise RuntimeError("环境变量 ROBOT_CODE 未配置")
    return pymysql.connect(**connection_options)
def _query_one(conn, sql: str, params: tuple = ()) -> dict | None:
cur = conn.cursor()
cur.execute(sql, params)
columns = [desc[0] for desc in cur.description] if cur.description else []
row = cur.fetchone()
cur.close()
if row is None:
return None
return dict(zip(columns, row))
# ---------------------------------------------------------------------------
# Settings resolution (mirrors the Go service logic)
# ---------------------------------------------------------------------------
def load_drawing_settings(conn, from_wx_id: str) -> tuple[bool, dict]:
    """Resolve the drawing switch and settings for one conversation.

    Global settings are read first. The chat-room row (ids ending in
    ``@chatroom``) or the friend row then overrides them: its
    ``image_ai_enabled`` wins when not NULL, and a non-blank
    ``image_ai_settings`` JSON replaces the global settings entirely.

    Returns:
        ``(enabled, image_ai_settings_dict)``.
    """

    def _parsed_or(current: dict, raw):
        # Decode a JSON settings column; keep `current` when it is empty.
        if not raw:
            return current
        if isinstance(raw, (bytes, bytearray)):
            raw = raw.decode("utf-8")
        if isinstance(raw, str) and raw.strip():
            return json.loads(raw)
        return current

    enabled = False
    settings_json: dict = {}

    # 1. Global defaults.
    global_row = _query_one(conn, "SELECT image_ai_enabled, image_ai_settings FROM global_settings LIMIT 1")
    if global_row:
        if global_row.get("image_ai_enabled"):
            enabled = bool(global_row["image_ai_enabled"])
        settings_json = _parsed_or(settings_json, global_row.get("image_ai_settings"))

    # 2. Per-conversation override (chat room or friend).
    if from_wx_id.endswith("@chatroom"):
        override_sql = "SELECT image_ai_enabled, image_ai_settings FROM chat_room_settings WHERE chat_room_id = %s LIMIT 1"
    else:
        override_sql = "SELECT image_ai_enabled, image_ai_settings FROM friend_settings WHERE wechat_id = %s LIMIT 1"
    override_row = _query_one(conn, override_sql, (from_wx_id,))
    if override_row:
        if override_row.get("image_ai_enabled") is not None:
            enabled = bool(override_row["image_ai_enabled"])
        settings_json = _parsed_or(settings_json, override_row.get("image_ai_settings"))

    return enabled, settings_json
# ---------------------------------------------------------------------------
# API callers
# ---------------------------------------------------------------------------
def _http_post_json(url: str, body: dict, headers: dict, timeout: int = 300) -> dict:
data = json.dumps(body).encode("utf-8")
req = urllib.request.Request(url, data=data, headers=headers, method="POST")
with urllib.request.urlopen(req, timeout=timeout) as resp:
return json.loads(resp.read().decode("utf-8"))
def _http_get_json(url: str, headers: dict, timeout: int = 30) -> dict:
req = urllib.request.Request(url, headers=headers, method="GET")
with urllib.request.urlopen(req, timeout=timeout) as resp:
return json.loads(resp.read().decode("utf-8"))
def _coerce_int(value, default: int, minimum: int, maximum: int) -> int:
try:
parsed = int(value)
except (TypeError, ValueError):
parsed = default
return min(max(parsed, minimum), maximum)
def _openai_output_format(config: dict) -> str:
output_format = str(config.get("output_format", "png") or "png").lower()
if output_format not in {"png", "jpeg", "webp"}:
return "png"
return output_format
def _openai_size(config: dict, ratio: str, resolution: str) -> str:
configured = str(config.get("size", "") or "").strip()
if configured:
return configured
normalized_ratio = (ratio or "").replace(" ", "").lower()
normalized_resolution = (resolution or "").replace(" ", "").lower()
if normalized_resolution in {"4k", "2160p", "3840x2160"}:
sizes = {
"16:9": "3840x2160",
"9:16": "2160x3840",
"1:1": "2048x2048",
"3:2": "3072x2048",
"2:3": "2048x3072",
}
elif normalized_resolution in {"2k", "1440p", "2048"}:
sizes = {
"16:9": "2048x1152",
"9:16": "1152x2048",
"1:1": "2048x2048",
"3:2": "2048x1360",
"2:3": "1360x2048",
}
elif normalized_resolution in {"1k", "1024", "1024p"}:
sizes = {
"16:9": "1536x864",
"9:16": "864x1536",
"1:1": "1024x1024",
"3:2": "1536x1024",
"2:3": "1024x1536",
}
else:
return "auto"
return sizes.get(normalized_ratio, "auto")
def _openai_prompt(prompt: str, negative_prompt: str) -> str:
if not negative_prompt:
return prompt
return f"{prompt}\n\n不要包含: {negative_prompt}"
def _openai_client(config: dict) -> OpenAI:
    """Build an ``OpenAI`` client from the drawing configuration.

    Raises:
        RuntimeError: if ``api_key`` is missing or blank.
        ValueError / TypeError: if ``timeout`` is set but not convertible to float.
    """
    api_key = str(config.get("api_key", "")).strip()
    if not api_key:
        raise RuntimeError("OpenAI 绘图配置缺少 api_key")

    def _optional(name: str):
        # Blank or missing optional settings are passed to the client as None.
        return str(config.get(name, "") or "").strip() or None

    raw_timeout = config.get("timeout")
    timeout: float | None = None if raw_timeout in (None, "") else float(raw_timeout)

    client_options = {
        "api_key": api_key,
        "base_url": _optional("base_url"),
        "organization": _optional("organization"),
        "project": _optional("project"),
        "timeout": timeout,
    }
    return OpenAI(**client_options)
def _truncate_debug_payload(value):
if isinstance(value, dict):
return {
key: (
f"{item[:50]}..." if key == "b64_json" and isinstance(item, str) and len(item) > 50 else _truncate_debug_payload(item)
)
for key, item in value.items()
}
if isinstance(value, list):
return [_truncate_debug_payload(item) for item in value]
return value
def _debug_response(label: str, payload) -> None:
    """Write a one-line ``[debug]`` JSON dump of ``payload`` to stdout.

    Objects exposing ``model_dump`` are converted through it first, and long
    base64 image strings are truncated before serialisation.
    """
    body = payload.model_dump() if hasattr(payload, "model_dump") else payload
    rendered = json.dumps(_truncate_debug_payload(body), ensure_ascii=False)
    sys.stdout.write(f"[debug] {label}: {rendered}\n")
def _rewrite_openai_image_url(url: str) -> str:
internal_host = "http://chatgpt2api:80"
external_host = "https://chatgpt2api.houhoukang.com"
if url.startswith(internal_host):
return f"{external_host}{url[len(internal_host):]}"
return url
def _extension_from_mime(mime_type: str) -> str:
if mime_type == "image/jpeg":
return ".jpg"
guessed = mimetypes.guess_extension(mime_type)
if guessed in {".png", ".jpg", ".jpeg", ".webp"}:
return guessed
return ".png"
def _extension_from_output_format(output_format: str) -> str:
if output_format == "jpeg":
return ".jpg"
if output_format == "webp":
return ".webp"
return ".png"
def _openai_response_value(item, key: str):
if isinstance(item, dict):
return item.get(key)
return getattr(item, key, None)
def _write_openai_b64_image(b64_json: str, output_format: str) -> str:
    """Decode a base64 image and store it in a persistent temporary file.

    Accepts raw base64 or a ``data:`` URI. For a ``data:`` URI that declares a
    MIME type, the file extension follows that type; otherwise it follows
    ``output_format``. Embedded whitespace is removed and missing ``=``
    padding is restored before decoding.

    Returns:
        Path of the created file. The name starts with
        ``wechat-openai-image-`` and the file is not deleted automatically.
    """
    payload = b64_json.strip()
    extension = _extension_from_output_format(output_format)

    if payload.startswith("data:"):
        meta, payload = payload.split(",", 1)
        declared_mime = meta[5:].split(";", 1)[0].strip().lower()
        if declared_mime:
            extension = _extension_from_mime(declared_mime)

    payload = "".join(payload.split())
    # Pad to a multiple of four characters, as the decoder requires.
    payload += "=" * (-len(payload) % 4)
    decoded = base64.b64decode(payload)

    with tempfile.NamedTemporaryFile(
        prefix="wechat-openai-image-", suffix=extension, delete=False
    ) as handle:
        handle.write(decoded)
        return handle.name
def _openai_images_from_response(response, output_format: str) -> list[str]:
    """Collect the images of an OpenAI response as local paths or URLs.

    For each entry of ``response.data``, a ``b64_json`` payload takes
    precedence and is written to a temporary file; otherwise a ``url`` is
    kept after host rewriting. Entries with neither are ignored. If anything
    fails, temporary files created so far are removed before re-raising.
    """
    collected: list[str] = []
    try:
        for entry in getattr(response, "data", []) or []:
            encoded = _openai_response_value(entry, "b64_json")
            if encoded:
                collected.append(_write_openai_b64_image(str(encoded), output_format))
            else:
                link = _openai_response_value(entry, "url")
                if link:
                    collected.append(_rewrite_openai_image_url(str(link)))
    except Exception:
        _cleanup_openai_temp_files(collected)
        raise
    return collected
def _is_remote_image_url(value: str) -> bool:
return urllib.parse.urlparse(value).scheme in {"http", "https"}
def _send_image_outputs(client_port: str, from_wx_id: str, image_outputs: list[str]) -> None:
    """Send generated images to ``from_wx_id`` through the local client API.

    Remote URLs go out in one batched request to ``.../send/image/url``.
    Local file paths are then sent one request each to
    ``.../send/image/local``. Empty entries are ignored.
    """
    remote_batch: list[str] = []
    local_files: list[str] = []
    for entry in image_outputs:
        if not entry:
            continue
        (remote_batch if _is_remote_image_url(entry) else local_files).append(entry)

    api_root = f"http://127.0.0.1:{client_port}/api/v1/robot/message/send/image"
    json_headers = {"Content-Type": "application/json"}

    if remote_batch:
        reply = _http_post_json(
            f"{api_root}/url",
            {"to_wxid": from_wx_id, "image_urls": remote_batch},
            json_headers,
            timeout=300,
        )
        _debug_response("send image url response", reply)

    for local_file in local_files:
        reply = _http_post_json(
            f"{api_root}/local",
            {"to_wxid": from_wx_id, "file_path": local_file},
            json_headers,
            timeout=300,
        )
        _debug_response("send image local response", reply)
def _cleanup_openai_temp_files(image_outputs: list[str]) -> None:
for value in image_outputs:
path = Path(value)
if path.name.startswith("wechat-openai-image-") and path.is_file():
try:
path.unlink()
except OSError:
pass
def call_jimeng(config: dict, prompt: str, model: str,
                negative_prompt: str, ratio: str, resolution: str) -> list[str]:
    """Call the JiMeng (即梦) image generation API and return the image URLs.

    Args:
        config: JiMeng settings; ``base_url`` and ``sessionid`` are required.
            ``sessionid`` may be a single string or a list of strings.
        prompt: Text prompt sent to the API.
        model: Model name; empty or ``"none"`` selects ``jimeng-5.0``.
        negative_prompt: Optional negative prompt; left out of the request when empty.
        ratio: Aspect ratio; defaults to ``16:9`` when empty.
        resolution: Resolution label; defaults to ``2k`` when empty.

    Returns:
        The non-empty ``url`` values found in the response's ``data`` list.

    Raises:
        RuntimeError: if ``base_url`` or ``sessionid`` is missing.
    """
    base_url = config.get("base_url", "").rstrip("/")
    raw_session_ids = config.get("sessionid", [])
    # A lone session id may be stored as a plain string; joining a str directly
    # would put a comma between every character of the token.
    if isinstance(raw_session_ids, str):
        raw_session_ids = [raw_session_ids]
    elif not isinstance(raw_session_ids, (list, tuple)):
        raw_session_ids = []
    session_ids = [
        sid.strip() for sid in raw_session_ids if isinstance(sid, str) and sid.strip()
    ]
    if not base_url or not session_ids:
        raise RuntimeError("即梦绘图配置缺少 base_url 或 sessionid")
    if not model or model == "none":
        model = "jimeng-5.0"
    if not ratio:
        ratio = "16:9"
    if not resolution:
        resolution = "2k"
    # Fall back to 2k when the first number in the label is greater than 4
    # (e.g. "8k"; note that labels such as "1080p" match this rule as well).
    m = re.search(r"(\d+)", resolution)
    if m and int(m.group(1)) > 4:
        resolution = "2k"
    # Multiple session ids are sent as one comma-separated bearer token.
    token = ",".join(session_ids)
    body = {
        "model": model,
        "prompt": prompt,
        "ratio": ratio,
        "resolution": resolution,
        "response_format": "url",
        "sample_strength": 0.5,
    }
    if negative_prompt:
        body["negative_prompt"] = negative_prompt
    resp = _http_post_json(
        f"{base_url}/v1/images/generations",
        body,
        {"Content-Type": "application/json", "Authorization": f"Bearer {token}"},
        timeout=300,
    )
    urls = [item["url"] for item in resp.get("data", []) if item.get("url")]
    return urls
def call_doubao(config: dict, prompt: str, model: str) -> list[str]:
    """Call the DouBao (豆包) image generation API and return the image URLs.

    Args:
        config: DouBao settings. ``api_key`` is required; ``size``,
            ``sequential_image_generation``, ``watermark`` and ``image`` are optional.
        prompt: Text prompt sent to the API.
        model: Friendly model name; empty or ``"none"`` selects
            ``doubao-seedream-4.5``. Names missing from the mapping are sent unchanged.

    Returns:
        The non-empty ``url`` values from the response's ``data`` list.

    Raises:
        RuntimeError: if ``api_key`` is missing.
    """
    api_key = config.get("api_key", "")
    if not api_key:
        raise RuntimeError("豆包绘图配置缺少 api_key")

    requested = "doubao-seedream-4.5" if (not model or model == "none") else model
    # Friendly model names -> endpoint model ids.
    endpoint_ids = {
        "doubao-seedream-4.5": "doubao-seedream-4-5-251128",
        "doubao-seedream-4.0": "doubao-seedream-4-0-251128",
        "doubao-seedream-3.0-t2i": "doubao-seedream-3-0-t2i-250415",
        "doubao-seededit-3.0-i2i": "doubao-seededit-3-0-i2i-250628",
    }
    payload = {
        "model": endpoint_ids.get(requested, requested),
        "prompt": prompt,
        "response_format": "url",
        "size": config.get("size", "2K"),
        "sequential_image_generation": config.get("sequential_image_generation", "auto"),
        "watermark": config.get("watermark", False),
    }
    reference_image = config.get("image", "")
    if reference_image:
        payload["image"] = reference_image

    result = _http_post_json(
        "https://ark.cn-beijing.volces.com/api/v3/images/generations",
        payload,
        {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
        timeout=300,
    )
    candidates = (entry.get("url") for entry in result.get("data", []))
    return [link for link in candidates if link]
def call_zimage(config: dict, prompt: str, model: str) -> list[str]:
    """Call the Z-Image (造相) image generation API, which works as an async task.

    A generation task is created first, then its status is polled every
    5 seconds for up to 15 minutes.

    Args:
        config: Z-Image settings. ``base_url`` and ``api_key`` are required;
            ``image_url`` is optional and forwarded as-is.
        prompt: Text prompt sent to the API.
        model: One of the supported friendly names; empty or ``"none"``
            selects ``Z-Image-Turbo``.

    Returns:
        The ``output_images`` list reported by the finished task.

    Raises:
        RuntimeError: on missing configuration, an unsupported model, a missing
            ``task_id``, a failed task, a successful task without images, or
            when the polling deadline passes.
    """
    base_url = config.get("base_url", "").rstrip("/")
    api_key = config.get("api_key", "")
    if not (base_url and api_key):
        raise RuntimeError("造相绘图配置缺少 base_url 或 api_key")
    if not model or model == "none":
        model = "Z-Image-Turbo"

    # Friendly model names -> model ids sent to the API.
    model_ids = {
        "Z-Image": "Tongyi-MAI/Z-Image",
        "Z-Image-Turbo": "Tongyi-MAI/Z-Image-Turbo",
        "Qwen-Image-Edit-2511": "Qwen/Qwen-Image-Edit-2511",
    }
    if model not in model_ids:
        raise RuntimeError(f"不支持的造相模型: {model}")

    # Step 1: create the task.
    created = _http_post_json(
        f"{base_url}/v1/images/generations",
        {
            "model": model_ids[model],
            "prompt": prompt,
            "image_url": config.get("image_url", []),
        },
        {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
            "X-ModelScope-Async-Mode": "true",
        },
        timeout=30,
    )
    task_id = created.get("task_id", "")
    if not task_id:
        raise RuntimeError("造相接口未返回 task_id")

    # Step 2: poll until the task finishes or the deadline passes.
    poll_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
        "X-ModelScope-Task-Type": "image_generation",
    }
    deadline = time.time() + 15 * 60  # 15 minutes
    while time.time() < deadline:
        task = _http_get_json(f"{base_url}/v1/tasks/{task_id}", poll_headers, timeout=30)
        state = task.get("task_status", "")
        if state == "FAILED":
            raise RuntimeError("造相绘图任务失败")
        if state == "SUCCEED":
            images = task.get("output_images", [])
            if not images:
                raise RuntimeError("造相任务成功但未返回图片")
            return images
        time.sleep(5)
    raise RuntimeError("造相绘图任务超时")
def call_openai(config: dict, prompt: str, model: str,
                negative_prompt: str, ratio: str, resolution: str) -> list[str]:
    """Call the OpenAI GPT Image API for text-to-image generation.

    ``quality``, ``moderation`` and ``background`` are read from ``config`` and
    fall back to ``"auto"`` when missing or empty; a ``"transparent"``
    background is replaced by ``"auto"``. ``output_compression`` is only sent
    for jpeg/webp output and only when configured.

    Returns:
        The outputs extracted from the API response by
        ``_openai_images_from_response``.
    """
    def _text_option(key: str) -> str:
        # Missing or falsy settings fall back to "auto".
        return str(config.get(key, "auto") or "auto")

    client = _openai_client(config)
    output_format = _openai_output_format(config)
    quality = _text_option("quality")
    moderation = _text_option("moderation")
    background = _text_option("background")
    if background == "transparent":
        background = "auto"

    request = dict(
        model=model or "gpt-image-2",
        prompt=_openai_prompt(prompt, negative_prompt),
        n=_coerce_int(config.get("n"), 1, 1, 10),
        size=_openai_size(config, ratio, resolution),
        quality=quality,
        background=background,
        moderation=moderation,
        output_format=output_format,
    )
    compression = config.get("output_compression")
    if compression is not None and output_format in {"jpeg", "webp"}:
        request["output_compression"] = _coerce_int(compression, 100, 0, 100)

    response = client.images.generate(**request)
    _debug_response("openai images.generate response", response)
    return _openai_images_from_response(response, output_format)
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
# Model names accepted on the command line, grouped by the provider that
# serves them. main() routes a request by testing membership in these sets,
# in the order JiMeng, DouBao, Z-Image, OpenAI.
# Models handled by call_jimeng.
JIMENG_MODELS = {"jimeng-4.5", "jimeng-4.6", "jimeng-4.7", "jimeng-5.0"}
# Models handled by call_doubao (friendly names, mapped to endpoint ids there).
DOUBAO_MODELS = {"doubao-seedream-4.5", "doubao-seedream-4.0", "doubao-seedream-3.0-t2i", "doubao-seededit-3.0-i2i"}
# Models handled by call_zimage.
ZIMAGE_MODELS = {"Z-Image", "Z-Image-Turbo", "Qwen-Image-Edit-2511"}
# Models handled by call_openai.
OPENAI_MODELS = {"gpt-image-2"}
def _parse_cli_params(argv: list[str]) -> dict[str, str]:
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--prompt", default="")
parser.add_argument("--model", default="")
parser.add_argument("--negative_prompt", default="")
parser.add_argument("--ratio", default="")
parser.add_argument("--resolution", default="")
namespace, unknown = parser.parse_known_args(argv)
if unknown:
raise ValueError(f"存在不支持的参数: {' '.join(unknown)}")
return {
"prompt": namespace.prompt,
"model": namespace.model,
"negative_prompt": namespace.negative_prompt,
"ratio": namespace.ratio,
"resolution": namespace.resolution,
}
def main() -> int:
    """Generate images for the command-line prompt and send them to the chat.

    Parses ``sys.argv``, loads the drawing settings for ``ROBOT_FROM_WX_ID``
    from MySQL, routes the request to the provider that owns the chosen model
    and posts the results to the local robot client. All messages are written
    to stdout.

    Returns:
        0 on success, or when drawing is disabled globally or for the selected
        provider; 1 on any error.
    """
    if len(sys.argv) < 2:
        sys.stdout.write("缺少输入参数\n")
        return 1
    try:
        params = _parse_cli_params(sys.argv[1:])
    except ValueError as exc:
        sys.stdout.write(f"参数格式错误: {exc}\n")
        return 1
    prompt = params.get("prompt", "").strip()
    if not prompt:
        sys.stdout.write("缺少画图提示词\n")
        return 1
    model = params.get("model", "").strip()
    negative_prompt = params.get("negative_prompt", "").strip()
    ratio = params.get("ratio", "").strip()
    resolution = params.get("resolution", "").strip()
    from_wx_id = os.environ.get("ROBOT_FROM_WX_ID", "").strip()
    if not from_wx_id:
        sys.stdout.write("环境变量 ROBOT_FROM_WX_ID 未配置\n")
        return 1
    # Connect to the database and load the drawing settings.
    try:
        conn = _mysql_connect()
    except Exception as exc:
        sys.stdout.write(f"数据库连接失败: {exc}\n")
        return 1
    try:
        enabled, settings_json = load_drawing_settings(conn, from_wx_id)
    except Exception as exc:
        # The connection is closed exactly once, by the finally clause below.
        sys.stdout.write(f"加载绘图配置失败: {exc}\n")
        return 1
    finally:
        try:
            conn.close()
        except Exception:
            # Best-effort close; the settings outcome is what matters here.
            pass
    if not enabled:
        sys.stdout.write("AI 绘图未开启\n")
        return 0
    # Default model
    if not model or model == "none":
        model = "jimeng-5.0"
    # Route to the provider that owns the model.
    try:
        image_urls: list[str] = []
        if model in JIMENG_MODELS:
            jimeng_config = settings_json.get("JiMeng", {})
            if not jimeng_config.get("enabled", False):
                sys.stdout.write("即梦绘图未开启\n")
                return 0
            image_urls = call_jimeng(jimeng_config, prompt, model, negative_prompt, ratio, resolution)
        elif model in DOUBAO_MODELS:
            doubao_config = settings_json.get("DouBao", {})
            if not doubao_config.get("enabled", False):
                sys.stdout.write("豆包绘图未开启\n")
                return 0
            image_urls = call_doubao(doubao_config, prompt, model)
        elif model in ZIMAGE_MODELS:
            zimage_config = settings_json.get("Z-Image", {})
            if not zimage_config.get("enabled", False):
                sys.stdout.write("造相绘图未开启\n")
                return 0
            image_urls = call_zimage(zimage_config, prompt, model)
        elif model in OPENAI_MODELS:
            openai_config = settings_json.get("OpenAI", {})
            if not openai_config.get("enabled", False):
                sys.stdout.write("OpenAI 绘图未开启\n")
                return 0
            image_urls = call_openai(openai_config, prompt, model, negative_prompt, ratio, resolution)
        else:
            sys.stdout.write("不支持的 AI 图像模型\n")
            return 1
    except Exception as exc:
        sys.stdout.write(f"调用绘图接口失败: {exc}\n")
        return 1
    if not image_urls:
        sys.stdout.write("未生成任何图像\n")
        return 1
    # Send the images through the local robot client.
    client_port = os.environ.get("ROBOT_WECHAT_CLIENT_PORT", "").strip()
    if not client_port:
        _cleanup_openai_temp_files(image_urls)
        sys.stdout.write("环境变量 ROBOT_WECHAT_CLIENT_PORT 未配置\n")
        return 1
    try:
        _send_image_outputs(client_port, from_wx_id, image_urls)
        sys.stdout.write("图片发送成功\n")
    except Exception as exc:
        sys.stdout.write(f"发送图片失败: {exc}\n")
        return 1
    finally:
        # Temporary image files are removed whether or not sending succeeded.
        _cleanup_openai_temp_files(image_urls)
    return 0
if __name__ == "__main__":
    # Exit with main()'s status. An unexpected exception has its traceback
    # printed to stdout and is mapped to exit status 1.
    try:
        status = main()
    except SystemExit:
        raise
    except Exception:
        traceback.print_exc(file=sys.stdout)
        status = 1
    raise SystemExit(status)

View File

@ -1,116 +0,0 @@
---
name: video-generation
description: "AI 视频生成工具。当用户想生成视频、文生视频、图生视频、让图片动起来、指定首帧尾帧生成视频时使用。支持纯文本生成视频,或使用 1 张图片作为首帧、2 张图片作为首帧和尾帧。"
argument-hint: "需要 prompt,可选 model、file_paths、ratio、resolution、duration。file_paths 最多 2 个。"
---
# Video Generation Skill
## 描述
这是一个 AI 视频生成技能,覆盖两类常见场景:
- 文生视频:用户只提供文本描述。
- 图生视频:用户提供 1 张首帧图,或 2 张首尾帧图,再结合提示词生成视频。
当前实现对接即梦视频接口,从数据库中的绘图配置读取 `base_url`、`sessionid` 等信息。脚本生成成功后会直接调用机器人客户端接口发送视频,不再输出固定的 XML 视频标签。
## 触发条件
- 用户想生成视频、做一段短视频、让画面动起来。
- 用户说「生成一个视频」「做个视频」「把这张图做成视频」「首帧是这张图」「尾帧用这张图」。
- 用户提到「文生视频」「图生视频」「首帧尾帧视频」「AI 视频生成」。
## 入参规范
```json
{
"type": "object",
"properties": {
"prompt": {
"type": "string",
"description": "根据用户输入的文本内容,提取出生成视频的提示词,但是不要对提示词进行修改。"
},
"model": {
"type": "string",
"description": "视频模型选择,可选,默认 none。",
"enum": [
"none",
"jimeng-video-seedance-2.0",
"jimeng-video-3.5-pro",
"jimeng-video-veo3",
"jimeng-video-veo3.1",
"jimeng-video-sora2",
"jimeng-video-3.0-pro",
"jimeng-video-3.0",
"jimeng-video-3.0-fast"
],
"default": "none"
},
"file_paths": {
"type": "array",
"items": {
"type": "string"
},
"description": "用于视频首尾帧的图片地址列表,可选。0 个表示文生视频,1 个表示首帧图生视频,2 个表示首尾帧图生视频。最多 2 个。"
},
"ratio": {
"type": "string",
"description": "视频比例,可选,默认 4:3。",
"default": "4:3"
},
"resolution": {
"type": "string",
"description": "视频分辨率,可选,默认 720p。",
"default": "720p"
},
"duration": {
"type": "integer",
"description": "视频时长,单位秒,可选,默认 5。",
"default": 5
}
},
"required": ["prompt"],
"additionalProperties": false
}
```
对应的命令行参数为:
- `--prompt <提示词>` 必填
- `--model <模型名>` 可选
- `--file_paths <图片地址>` 可选,可重复传入 0 到 2 次
- `--ratio <比例>` 可选
- `--resolution <分辨率>` 可选
- `--duration <秒数>` 可选
## 依赖安装
- 脚本首次运行时会自动创建虚拟环境并安装依赖,无需手动执行。
- 如需手动重新安装,可执行:`python3 scripts/bootstrap.py`
## 执行步骤
1. 当用户想生成视频时触发该技能。
2. 从用户输入中提取 `prompt`,不要改写提示词本身。
3. 根据上下文可选提取 `model`、`file_paths`、`ratio`、`resolution`、`duration`。
4. 如果用户没有明确指定模型,默认使用 `jimeng-video-3.0-fast`
5. 在仓库根目录执行脚本,例如:
```bash
python3 scripts/video_generation.py --prompt '海边日落,镜头缓慢推进' --file_paths 'https://example.com/start.jpg'
```
6. 脚本生成视频后会自动调用客户端接口 `POST http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/message/send/video/url` 将视频发送给用户,成功时输出「ended」。
## 校验规则
- `prompt` 不能为空。
- `file_paths` 最多只能有 2 个。
- 目前只支持即梦视频模型。
- 若数据库里关闭了 AI 绘图能力或即梦配置不可用,脚本会直接返回明确错误。
## 回复要求
- 成功时脚本输出「ended」,表示视频已通过客户端接口直接发送,无需 AI 智能体再做额外处理。
- 失败时,返回脚本输出的具体错误信息。

View File

@ -1,134 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import hashlib
import subprocess
import sys
import traceback
from pathlib import Path
sys.stderr = sys.stdout
def _skill_root_from(script_dir: Path) -> Path:
return script_dir.parent
def _venv_dir(script_dir: Path) -> Path:
return _skill_root_from(script_dir) / ".venv"
def _venv_python(venv_dir: Path) -> Path:
if sys.platform == "win32":
return venv_dir / "Scripts" / "python.exe"
return venv_dir / "bin" / "python"
def _stamp_file(venv_dir: Path) -> Path:
return venv_dir / ".req_hash"
def _file_hash(path: Path) -> str:
return hashlib.sha256(path.read_bytes()).hexdigest()
def _deps_up_to_date(requirements_file: Path, venv_dir: Path) -> bool:
stamp = _stamp_file(venv_dir)
if not stamp.is_file():
return False
return stamp.read_text().strip() == _file_hash(requirements_file)
def _write_stamp(requirements_file: Path, venv_dir: Path) -> None:
_stamp_file(venv_dir).write_text(_file_hash(requirements_file))
def _ensure_venv(venv_dir: Path, venv_python: Path) -> int:
if venv_python.is_file():
return 0
sys.stdout.write(f"未检测到技能虚拟环境,正在创建: {venv_dir}\n")
import shutil
py = sys.executable or next(
(shutil.which(c) for c in ("python3", "python") if shutil.which(c)), None
)
if not py:
raise RuntimeError("无法找到 Python 解释器路径")
command = [
py,
"-m",
"venv",
str(venv_dir),
]
try:
subprocess.run(command, check=True, stdout=sys.stdout, stderr=sys.stdout)
except subprocess.CalledProcessError as exc:
sys.stdout.write(f"创建虚拟环境失败,退出码: {exc.returncode}\n")
return exc.returncode or 1
return 0
def main() -> int:
    """Create the skill virtualenv if needed and install its requirements.

    Installation is skipped when the stored requirements hash is current.
    Otherwise pip is upgraded, ``requirements.txt`` is installed and the hash
    stamp is rewritten.

    Returns:
        0 on success or when nothing needs installing; 1 when the requirements
        file is missing; otherwise the exit code of the failing step.
    """
    script_dir = Path(__file__).resolve().parent
    requirements_file = script_dir / "requirements.txt"
    venv_dir = _venv_dir(script_dir)
    venv_python = _venv_python(venv_dir)

    if not requirements_file.is_file():
        sys.stdout.write(f"未找到依赖文件: {requirements_file}\n")
        return 1

    status = _ensure_venv(venv_dir, venv_python)
    if status != 0:
        return status

    if _deps_up_to_date(requirements_file, venv_dir):
        sys.stdout.write("依赖已是最新,跳过安装\n")
        return 0

    pip_install = [str(venv_python), "-m", "pip", "install"]

    try:
        subprocess.run(pip_install + ["--upgrade", "pip"], check=True, stdout=sys.stdout, stderr=sys.stdout)
    except subprocess.CalledProcessError as exc:
        sys.stdout.write(f"升级 pip 失败,退出码: {exc.returncode}\n")
        return exc.returncode or 1

    try:
        subprocess.run(pip_install + ["-r", str(requirements_file)], check=True, stdout=sys.stdout, stderr=sys.stdout)
    except subprocess.CalledProcessError as exc:
        sys.stdout.write(f"安装依赖失败,退出码: {exc.returncode}\n")
        return exc.returncode or 1

    # Only stamp after a successful install so a failed run is retried next time.
    _write_stamp(requirements_file, venv_dir)
    sys.stdout.write(f"依赖安装完成,当前技能虚拟环境: {venv_dir}\n")
    return 0
if __name__ == "__main__":
    # Exit with main()'s status. An unexpected exception has its traceback
    # printed to stdout and is mapped to exit status 1.
    try:
        status = main()
    except SystemExit:
        raise
    except Exception:
        traceback.print_exc(file=sys.stdout)
        status = 1
    raise SystemExit(status)

View File

@ -1,2 +0,0 @@
cryptography
pymysql

View File

@ -1,370 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import json
import os
import subprocess
import sys
import traceback
import urllib.request
from pathlib import Path
sys.stderr = sys.stdout
# Video model names accepted by main(); any other name is rejected there.
SUPPORTED_MODELS = {
"jimeng-video-seedance-2.0",
"jimeng-video-3.5-pro",
"jimeng-video-veo3",
"jimeng-video-veo3.1",
"jimeng-video-sora2",
"jimeng-video-3.0-pro",
"jimeng-video-3.0",
"jimeng-video-3.0-fast",
}
# Fallbacks used when the corresponding command-line option is empty
# (for the model, also when it is "none").
DEFAULT_MODEL = "jimeng-video-3.0-fast"
DEFAULT_RATIO = "4:3"
DEFAULT_RESOLUTION = "720p"
# Video duration in seconds.
DEFAULT_DURATION = 5
def _skill_root() -> Path:
script_dir = Path(__file__).resolve().parent
return script_dir.parent
def _skill_venv_python() -> Path:
venv_dir = _skill_root() / ".venv"
if sys.platform == "win32":
return venv_dir / "Scripts" / "python.exe"
return venv_dir / "bin" / "python"
def _get_python_executable() -> str:
if sys.executable:
return sys.executable
import shutil
for candidate in ("python3", "python"):
found = shutil.which(candidate)
if found:
return found
raise RuntimeError("无法找到 Python 解释器路径")
def _run_bootstrap() -> None:
bootstrap = Path(__file__).resolve().parent / "bootstrap.py"
result = subprocess.run([_get_python_executable(), str(bootstrap)])
if result.returncode != 0:
raise SystemExit(result.returncode)
def _ensure_skill_venv_python() -> None:
venv_python = _skill_venv_python()
if not venv_python.is_file():
_run_bootstrap()
venv_python = _skill_venv_python()
if not venv_python.is_file():
sys.stdout.write("bootstrap 后仍未找到虚拟环境\n")
raise SystemExit(1)
venv_dir = _skill_root() / ".venv"
if Path(sys.prefix) == venv_dir.resolve():
return
os.execv(str(venv_python), [str(venv_python), str(Path(__file__).resolve()), *sys.argv[1:]])
# Switch to the skill's virtualenv interpreter before importing third-party packages.
_ensure_skill_venv_python()
try:
import pymysql # type: ignore # noqa: E402
except ModuleNotFoundError:
# pymysql is missing from the active interpreter: run the bootstrap installer,
# then restart this script with the same arguments.
# NOTE(review): if the bootstrap reports success but pymysql still cannot be
# imported, this path restarts the script again each time — confirm it cannot loop.
_run_bootstrap()
_py = _get_python_executable()
os.execv(_py, [_py, str(Path(__file__).resolve()), *sys.argv[1:]])
def _mysql_connect():
host = os.environ.get("MYSQL_HOST", "127.0.0.1")
port = int(os.environ.get("MYSQL_PORT", "3306"))
user = os.environ.get("MYSQL_USER", "root")
password = os.environ.get("MYSQL_PASSWORD", "")
database = os.environ.get("ROBOT_CODE", "")
if not database:
raise RuntimeError("环境变量 ROBOT_CODE 未配置")
return pymysql.connect(
host=host,
port=port,
user=user,
password=password,
database=database,
charset="utf8mb4",
connect_timeout=10,
read_timeout=30,
)
def _query_one(conn, sql: str, params: tuple = ()) -> dict | None:
cur = conn.cursor()
cur.execute(sql, params)
columns = [desc[0] for desc in cur.description] if cur.description else []
row = cur.fetchone()
cur.close()
if row is None:
return None
return dict(zip(columns, row))
def load_drawing_settings(conn, from_wx_id: str) -> tuple[bool, dict]:
    """Load the effective drawing switch and settings for a conversation.

    The global row is read first. A conversation-specific row is then read from
    ``chat_room_settings`` when ``from_wx_id`` ends with ``@chatroom``, and from
    ``friend_settings`` otherwise. In the conversation row, a non-null
    ``image_ai_enabled`` and a non-blank ``image_ai_settings`` JSON each
    override the global value.

    Returns:
        ``(enabled, settings_json)``; ``(False, {})`` when nothing is configured.
    """
    def _merge(row, enabled, settings):
        # Fold one settings row into the values accumulated so far.
        if not row:
            return enabled, settings
        flag = row.get("image_ai_enabled")
        if flag is not None:
            enabled = bool(flag)
        raw = row.get("image_ai_settings")
        if raw:
            if isinstance(raw, (bytes, bytearray)):
                raw = raw.decode("utf-8")
            if isinstance(raw, str) and raw.strip():
                settings = json.loads(raw)
        return enabled, settings

    global_row = _query_one(conn, "SELECT image_ai_enabled, image_ai_settings FROM global_settings LIMIT 1")
    enabled, settings_json = _merge(global_row, False, {})

    if from_wx_id.endswith("@chatroom"):
        scoped_row = _query_one(
            conn,
            "SELECT image_ai_enabled, image_ai_settings FROM chat_room_settings WHERE chat_room_id = %s LIMIT 1",
            (from_wx_id,),
        )
    else:
        scoped_row = _query_one(
            conn,
            "SELECT image_ai_enabled, image_ai_settings FROM friend_settings WHERE wechat_id = %s LIMIT 1",
            (from_wx_id,),
        )
    return _merge(scoped_row, enabled, settings_json)
def _resolve_jimeng_config(settings_json: dict) -> dict:
jimeng_config = settings_json.get("JiMeng")
if isinstance(jimeng_config, dict) and jimeng_config:
return jimeng_config
if isinstance(settings_json, dict):
return settings_json
return {}
def _normalize_session_ids(raw: object) -> list[str]:
if isinstance(raw, str):
return [raw] if raw.strip() else []
if isinstance(raw, list):
return [item.strip() for item in raw if isinstance(item, str) and item.strip()]
return []
def _http_post_json(url: str, body: dict, headers: dict, timeout: int = 300) -> dict:
data = json.dumps(body).encode("utf-8")
req = urllib.request.Request(url, data=data, headers=headers, method="POST")
with urllib.request.urlopen(req, timeout=timeout) as resp:
return json.loads(resp.read().decode("utf-8"))
def send_videos(from_wx_id: str, video_urls: list[str]) -> None:
    """Send video URLs to a conversation through the local robot client.

    Args:
        from_wx_id: Recipient id, passed through as ``to_wxid``.
        video_urls: Video URLs; empty entries are dropped before sending.

    Raises:
        RuntimeError: if ``ROBOT_WECHAT_CLIENT_PORT`` is not configured.
    """
    client_port = os.environ.get("ROBOT_WECHAT_CLIENT_PORT", "").strip()
    if not client_port:
        raise RuntimeError("环境变量 ROBOT_WECHAT_CLIENT_PORT 未配置")

    usable_urls = list(filter(None, video_urls))
    _http_post_json(
        f"http://127.0.0.1:{client_port}/api/v1/robot/message/send/video/url",
        {"to_wxid": from_wx_id, "video_urls": usable_urls},
        {"Content-Type": "application/json"},
        timeout=60,
    )
def call_jimeng_video(
    config: dict,
    prompt: str,
    model: str,
    file_paths: list[str],
    ratio: str,
    resolution: str,
    duration: int,
) -> list[str]:
    """Call the JiMeng video generation API and return the video URLs.

    Args:
        config: JiMeng settings; ``base_url`` and ``sessionid`` are required.
        prompt: Text prompt sent to the API.
        model: Model name; falls back to ``DEFAULT_MODEL`` when empty.
        file_paths: Image addresses forwarded as ``file_paths``; omitted when empty.
        ratio: Aspect ratio; falls back to ``DEFAULT_RATIO`` when empty.
        resolution: Resolution; falls back to ``DEFAULT_RESOLUTION`` when empty.
        duration: Duration in seconds; falls back to ``DEFAULT_DURATION`` when 0.

    Returns:
        The non-blank string ``url`` values of the dict entries in the
        response's ``data`` list.

    Raises:
        RuntimeError: if ``base_url`` or ``sessionid`` is missing.
    """
    base_url = str(config.get("base_url", "")).rstrip("/")
    session_ids = _normalize_session_ids(config.get("sessionid", []))
    if not (base_url and session_ids):
        raise RuntimeError("即梦视频配置缺少 base_url 或 sessionid")

    payload = {
        "model": model or DEFAULT_MODEL,
        "prompt": prompt,
        "ratio": ratio or DEFAULT_RATIO,
        "resolution": resolution or DEFAULT_RESOLUTION,
        "duration": duration or DEFAULT_DURATION,
        "response_format": "url",
    }
    if file_paths:
        payload["file_paths"] = file_paths

    # All session ids travel as one comma-separated bearer token.
    auth_headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {','.join(session_ids)}",
    }
    result = _http_post_json(
        f"{base_url}/v1/videos/generations",
        payload,
        auth_headers,
        timeout=300,
    )
    return [
        entry["url"]
        for entry in result.get("data", [])
        if isinstance(entry, dict)
        and isinstance(entry.get("url"), str)
        and entry["url"].strip()
    ]
def _parse_cli_params(argv: list[str]) -> dict:
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument("--prompt", default="")
parser.add_argument("--model", default="")
parser.add_argument("--file_paths", action="append", default=[])
parser.add_argument("--ratio", default="")
parser.add_argument("--resolution", default="")
parser.add_argument("--duration", type=int, default=0)
namespace, unknown = parser.parse_known_args(argv)
if unknown:
raise ValueError(f"存在不支持的参数: {' '.join(unknown)}")
return {
"prompt": namespace.prompt,
"model": namespace.model,
"file_paths": [path for path in namespace.file_paths if path.strip()],
"ratio": namespace.ratio,
"resolution": namespace.resolution,
"duration": namespace.duration,
}
def main() -> int:
    """Generate a video for the command-line prompt and send it to the chat.

    Validates the arguments, loads the drawing settings for
    ``ROBOT_FROM_WX_ID`` from MySQL, calls the JiMeng video API and sends the
    resulting URLs through the local robot client. All messages are written
    to stdout; ``ended`` is written after a successful send.

    Returns:
        0 on success, or when generation is disabled globally or for JiMeng;
        1 on any error.
    """
    def _finish(message: str, code: int) -> int:
        # Write the message to stdout and hand back the exit status.
        sys.stdout.write(message)
        return code

    if len(sys.argv) < 2:
        return _finish("缺少输入参数\n", 1)
    try:
        params = _parse_cli_params(sys.argv[1:])
    except ValueError as exc:
        return _finish(f"参数格式错误: {exc}\n", 1)

    prompt = params.get("prompt", "").strip()
    if not prompt:
        return _finish("缺少视频提示词\n", 1)

    model = params.get("model", "").strip()
    if not model or model == "none":
        model = DEFAULT_MODEL
    if model not in SUPPORTED_MODELS:
        return _finish("不支持的 AI 视频模型\n", 1)

    file_paths = params.get("file_paths", [])
    if len(file_paths) > 2:
        return _finish("file_paths 最多只能传 2 个\n", 1)

    ratio = params.get("ratio", "").strip() or DEFAULT_RATIO
    resolution = params.get("resolution", "").strip() or DEFAULT_RESOLUTION
    duration = params.get("duration", 0) or DEFAULT_DURATION
    if duration <= 0:
        return _finish("duration 必须大于 0\n", 1)

    from_wx_id = os.environ.get("ROBOT_FROM_WX_ID", "").strip()
    if not from_wx_id:
        return _finish("环境变量 ROBOT_FROM_WX_ID 未配置\n", 1)

    try:
        conn = _mysql_connect()
    except Exception as exc:
        return _finish(f"数据库连接失败: {exc}\n", 1)
    try:
        enabled, settings_json = load_drawing_settings(conn, from_wx_id)
    except Exception as exc:
        return _finish(f"加载绘图配置失败: {exc}\n", 1)
    finally:
        try:
            conn.close()
        except Exception:
            # Best-effort close; the settings outcome is what matters here.
            pass

    if not enabled:
        return _finish("AI 生成视频未开启\n", 0)

    jimeng_config = _resolve_jimeng_config(settings_json)
    if not isinstance(jimeng_config, dict) or not jimeng_config:
        return _finish("未找到即梦视频配置\n", 1)
    # Only an explicit ``enabled: false`` disables JiMeng; a missing key does not.
    if jimeng_config.get("enabled") is False:
        return _finish("即梦视频未开启\n", 0)

    try:
        video_urls = call_jimeng_video(
            jimeng_config, prompt, model, file_paths, ratio, resolution, duration
        )
    except Exception as exc:
        return _finish(f"调用即梦生成视频接口失败: {exc}\n", 1)
    if not video_urls:
        return _finish("未生成任何视频\n", 1)

    try:
        send_videos(from_wx_id, video_urls)
        sys.stdout.write("ended")
    except Exception as exc:
        return _finish(f"发送视频失败: {exc}\n", 1)
    return 0
if __name__ == "__main__":
    # Exit with main()'s status. An unexpected exception has its traceback
    # printed to stdout and is mapped to exit status 1.
    try:
        status = main()
    except SystemExit:
        raise
    except Exception:
        traceback.print_exc(file=sys.stdout)
        status = 1
    raise SystemExit(status)

View File

@ -1,206 +0,0 @@
---
name: voice-message
description: "文本转语音与语音消息发送技能。当用户想让我说话、发语音、把一段话转成语音、用某种情绪/音色/语速/方言读出来时使用。支持 content、emotion、voice、style_prompt、voice_prompt、audio_tags、context_texts 等通用参数,并自动把合成结果作为语音消息发给当前会话。"
argument-hint: "需要 content,可选 emotion、voice、style_prompt、voice_prompt、audio_tags、context_texts、speaking_rate、pitch、volume、dialect。"
---
# Voice Message Skill
## 描述
这是一个将文本合成为语音并直接发送到当前微信会话的技能。
技能脚本位于 `scripts/voice_message.py`
## 触发条件
- 用户想让你发语音、说一句话、用语音回复。
- 用户说「把这句话读出来」「帮我发个语音」「用开心一点的语气说」。
- 用户要求指定音色、语速、音量、方言、角色感、播报风格或音频标签。
- 用户明确要求文本转语音。
## 入参规范
```json
{
"type": "object",
"properties": {
"content": {
"type": "string",
"description": "要转成语音的文本内容。必须保留用户原意,不要无故扩写。最长 260 个字符。"
},
"emotion": {
"type": "string",
"description": "可选,用户明确要求的情绪或整体风格词,例如 happy、tender、开心、委屈、慵懒、磁性。不要为了适配供应商而改写。"
},
"voice": {
"type": "string",
"description": "可选,用户明确指定的音色名、speaker 名或供应商配置中约定的 voice 名称,例如 Chloe、冰糖、mimo_default。不要把“女声”“低沉”这类描述放在这里,应放到 voice_prompt。"
},
"voice_prompt": {
"type": "string",
"description": "可选,声线/音色描述,例如“年轻女性,声音清亮,语气温柔但带一点疲惫”。适合文本音色设计,也会作为其他供应商的辅助风格提示。"
},
"context_texts": {
"type": "array",
"items": {
"type": "string"
},
"description": "可选,语音合成辅助信息或对话上下文。仅在需要补充语境、人物状态、说话方式时使用。"
},
"style_prompt": {
"type": "array",
"items": {
"type": "string"
},
"description": "可选,自然语言风格/导演提示,例如“语速稍快,尾音上扬,像刚查到好成绩一样压不住开心”。可重复传入。"
},
"audio_tags": {
"type": "array",
"items": {
"type": "string"
},
"description": "可选,音频标签或整体标签,例如“粤语”“唱歌”“轻笑”“深呼吸”。仅当用户明确要求标签、方言、唱歌、笑声、停顿等细粒度控制时传入。"
},
"speaking_rate": {
"type": "string",
"description": "可选,语速要求,例如“偏慢”“稍快”“像连珠炮”。"
},
"pitch": {
"type": "string",
"description": "可选,音高要求,例如“更低沉”“明亮上扬”。"
},
"volume": {
"type": "string",
"description": "可选,音量或力度要求,例如“小声耳语”“提高音量喊话”。"
},
"dialect": {
"type": "string",
"description": "可选,方言或口音要求,例如“粤语”“四川话”“东北话”“轻微台湾腔”。"
}
},
"required": ["content"],
"additionalProperties": false
}
```
对应命令行参数:
- `--content <文本>` 必填
- `--emotion <情绪/风格>` 可选
- `--voice <音色名或 speaker 名>` 可选
- `--voice_prompt <声线/音色描述>` 可选
- `--style_prompt <自然语言风格提示>` 可选,可重复传入多次
- `--audio_tags <音频标签>` 可选,可重复传入多次
- `--context_texts <辅助文本>` 可选,可重复传入多次
- `--speaking_rate <语速>` 可选
- `--pitch <音高>` 可选
- `--volume <音量>` 可选
- `--dialect <方言/口音>` 可选
## 参数抽取规则
1. `content` 必须来自用户明确想让你说出的内容,不要加入寒暄、解释或额外总结。
2. 如果用户只说“你用语音回复我”但没有提供具体要说的话,应先基于上下文生成一段简洁、自然、适合直接播报的回复,再把这段回复作为 `content`
3. 不要判断当前使用的是哪个语音供应商,也不要为了供应商改写参数;只按用户意图提取通用参数,脚本会自动映射。
4. 只有当用户明确要求情绪或语气时才传 `emotion`。`emotion` 可以是中文或英文短词,不必限制在某个供应商枚举内。
5. 用户指定明确音色名时用 `voice`;用户描述“女声、低沉、御姐音、年轻男性”等声线质感时用 `voice_prompt`
6. 语速、音高、音量、方言有明确要求时优先填 `speaking_rate`、`pitch`、`volume`、`dialect`;复杂演绎要求放入 `style_prompt`
7. `audio_tags` 仅用于用户明确要求唱歌、方言、笑声、停顿、深呼吸等标签化控制时;如果用户已把标签写在 `content` 中,不要重复添加。
8. `context_texts` 适合表达上下文、场景、人物状态和补充播报要求。
9. 不要传递音色复刻音频参数。若当前消息引用了一条语音消息,脚本会通过 `ROBOT_REF_MESSAGE_ID` 自动判断并下载引用语音作为复刻样本。
10. `content` 超过 260 个字符时,不应该调用本技能。
## 音频标签控制
通过在文本中嵌入风格标签与音频标签,直接对语音进行精细控制。开头是整体风格标签,中间可以插入细粒度控制标签。
在目标文本开头添加 `(风格)` 标签,即可指定语音的发音风格。支持同时设置多种风格,将多个风格名称置于同一对括号内,分隔符不限。
支持的括号格式: 可使用半角 `()`、全角 `（）`、`[]`
### 格式示例
```
风格类型 风格示例
基础情绪 开心/悲伤/愤怒/恐惧/惊讶/兴奋/委屈/平静/冷漠
复合情绪 怅然/欣慰/无奈/愧疚/释然/嫉妒/厌倦/忐忑/动情
整体语调 温柔/高冷/活泼/严肃/慵懒/俏皮/深沉/干练/凌厉
音色定位 磁性/醇厚/清亮/空灵/稚嫩/苍老/甜美/沙哑/醇雅
人设腔调 夹子音/御姐音/正太音/大叔音/台湾腔
方言 东北话/四川话/河南话/粤语
角色扮演 孙悟空/林黛玉
唱歌 唱歌
```
样例:
- (怅然)这么多年过去了,再走过那条街,心里一下子空了一块。
- (慵懒)再让我睡五分钟……就五分钟,真的,最后一次。
- (磁性)夜已经深了,城市还在呼吸。我是今晚陪你的人,欢迎收听《午夜电台》。
- (东北话)哎呀妈呀,这天儿也忒冷了吧!你说这风,嗖嗖的,跟刀子似的,割脸啊!
- (粤语)呢个真係好正啊!食过一次就唔会忘记!
- (唱歌)原谅我这一生不羁放纵爱自由,也会怕有一天会跌倒,Oh no。背弃了理想,谁人都可以,哪会怕有一天只你共我。
在此基础上,我们还支持在文本中任意位置插入 [音频标签]。通过 [音频标签] ,你可以对声音进行细粒度控制,精准调节语气、情绪和表达风格——无论是低声耳语、放声大笑,还是带点小情绪的小吐槽,也可以灵活插入呼吸声,停顿,咳嗽等,都能轻松实现。语速同样可以灵活调整,让每句话都有它该有的节奏。
```
风格类型 风格示例
语速与节奏 吸气/深呼吸/叹气/长叹一口气/喘息/屏息
情绪状态 紧张/害怕/激动/疲惫/委屈/撒娇/心虚/震惊/不耐烦
语音特征 颤抖/声音颤抖/变调/破音/鼻音/气声/沙哑
哭笑表达 笑/轻笑/大笑/冷笑/抽泣/呜咽/哽咽/嚎啕大哭
```
样例:
- (紧张,深呼吸)呼……冷静,冷静。不就是一个面试吗……(语速加快,碎碎念)自我介绍已经背了五十遍了,应该没问题的。加油,你可以的……(小声)哎呀,领带歪没歪?
- (极其疲惫,有气无力)师傅……到地方了叫我一声……(长叹一口气)我先眯一会儿,这班加得我魂儿都要散了。
- 如果我当时……(沉默片刻)哪怕再坚持一秒钟,结果是不是就不一样了?(苦笑)呵,没如果了。
- (寒冷导致的急促呼吸)呼——呼——这、这大兴安岭的雪……(咳嗽)简直能把人骨头冻透了……别、别停下,走,快走。
- (提高音量喊话)大姐!这鱼新鲜着呢!早上刚捞上来的!哎!那个谁,别乱翻,压坏了你赔啊?!
### 特别注意
- 只有`mimo-v2.5-tts`模型支持唱歌模式
- 如需体验更佳的唱歌风格,必须在目标文本最开头添加 `(唱歌)` 标签,格式为:`(唱歌)歌词`。歌词建议采用中文,可获得更优合成效果。标签内标识支持以下取值,效果等效:`唱歌`、`sing`、`singing`。
## 执行步骤
1. 识别用户是否明确需要语音消息。
2. 提取 `content`,可选提取 `emotion`、`voice`、`voice_prompt`、`style_prompt`、`audio_tags`、`context_texts` 等通用控制参数。
3. 在仓库根目录执行:
```bash
python3 scripts/voice_message.py --content '这是一条语音消息' --emotion happy --style_prompt '请自然一点'
```
4. 脚本会读取数据库中的 TTS 配置,按当前供应商能力映射通用参数,调用语音合成接口并通过客户端接口 `POST http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/message/send/voice` 直接发送语音。
## 供应商映射说明
- Doubao:`content` 写入文本字段;支持的 `emotion` 写入音频情绪参数;`voice` 可覆盖 speaker;其他风格控制会合并到 `context_texts` 辅助信息。
- MiMo V2.5:`content` 写入 `assistant` 消息;`style_prompt`、`voice_prompt`、`context_texts`、`emotion`、`speaking_rate`、`pitch`、`volume`、`dialect` 会合并为 `user` 风格/音色控制;`audio_tags` 会作为整体标签加到要合成的文本前。
- MiMo 会默认使用非流式 `wav` 输出;配置中 `stream: true` 时使用 `pcm16` 流式兼容模式并在脚本内封装为 `wav`
- MiMo 在 `auto_model` 未关闭时,会根据 `voice_prompt` 自动选择 `mimo-v2.5-tts-voicedesign`;如果 `ROBOT_REF_MESSAGE_ID` 指向数据库中 `messages.type = 34` 的语音消息,则脚本会调用客户端接口下载该语音 wav,并自动选择 `mimo-v2.5-tts-voiceclone`
- 引用消息下载接口为 `GET http://127.0.0.1:{ROBOT_WECHAT_CLIENT_PORT}/api/v1/robot/chat/voice/download?message_id={ROBOT_REF_MESSAGE_ID}`,返回 wav 后由脚本封装为 MiMo 需要的 `data:audio/wav;base64,...`
## 依赖安装
- 脚本首次运行时会自动创建虚拟环境并安装依赖,无需手动执行。
- 如需手动重新安装,可执行:`python3 scripts/bootstrap.py`
## 回复要求
- 成功时脚本输出「ended」,表示语音已直接发送,无需 AI 智能体再拼装额外消息。
- 失败时,返回脚本输出的具体错误信息。

View File

@ -1,115 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import hashlib
import subprocess
import sys
import traceback
from pathlib import Path
sys.stderr = sys.stdout
def _skill_root_from(script_dir: Path) -> Path:
return script_dir.parent
def _venv_dir(script_dir: Path) -> Path:
return _skill_root_from(script_dir) / ".venv"
def _venv_python(venv_dir: Path) -> Path:
if sys.platform == "win32":
return venv_dir / "Scripts" / "python.exe"
return venv_dir / "bin" / "python"
def _stamp_file(venv_dir: Path) -> Path:
return venv_dir / ".req_hash"
def _file_hash(path: Path) -> str:
return hashlib.sha256(path.read_bytes()).hexdigest()
def _deps_up_to_date(requirements_file: Path, venv_dir: Path) -> bool:
stamp = _stamp_file(venv_dir)
if not stamp.is_file():
return False
return stamp.read_text().strip() == _file_hash(requirements_file)
def _write_stamp(requirements_file: Path, venv_dir: Path) -> None:
_stamp_file(venv_dir).write_text(_file_hash(requirements_file))
def _ensure_venv(venv_dir: Path, venv_python: Path) -> int:
if venv_python.is_file():
return 0
sys.stdout.write(f"未检测到技能虚拟环境,正在创建: {venv_dir}\n")
import shutil
py = sys.executable or next(
(shutil.which(c) for c in ("python3", "python") if shutil.which(c)), None
)
if not py:
raise RuntimeError("无法找到 Python 解释器路径")
command = [py, "-m", "venv", str(venv_dir)]
try:
subprocess.run(command, check=True, stdout=sys.stdout, stderr=sys.stdout)
except subprocess.CalledProcessError as exc:
sys.stdout.write(f"创建虚拟环境失败,退出码: {exc.returncode}\n")
return exc.returncode or 1
return 0
def main() -> int:
    """Create the skill virtualenv if needed and install its requirements.

    Installation is skipped when the stored requirements hash is current.
    Otherwise pip is upgraded, ``requirements.txt`` is installed and the hash
    stamp is rewritten.

    Returns:
        0 on success or when nothing needs installing; 1 when the requirements
        file is missing; otherwise the exit code of the failing step.
    """
    script_dir = Path(__file__).resolve().parent
    requirements_file = script_dir / "requirements.txt"
    venv_dir = _venv_dir(script_dir)
    venv_python = _venv_python(venv_dir)

    if not requirements_file.is_file():
        sys.stdout.write(f"未找到依赖文件: {requirements_file}\n")
        return 1

    status = _ensure_venv(venv_dir, venv_python)
    if status != 0:
        return status

    if _deps_up_to_date(requirements_file, venv_dir):
        sys.stdout.write("依赖已是最新,跳过安装\n")
        return 0

    pip_install = [str(venv_python), "-m", "pip", "install"]

    upgrade_pip = pip_install + ["--upgrade", "pip"]
    try:
        subprocess.run(upgrade_pip, check=True, stdout=sys.stdout, stderr=sys.stdout)
    except subprocess.CalledProcessError as exc:
        sys.stdout.write(f"升级 pip 失败,退出码: {exc.returncode}\n")
        return exc.returncode or 1

    install_requirements = pip_install + ["-r", str(requirements_file)]
    try:
        subprocess.run(install_requirements, check=True, stdout=sys.stdout, stderr=sys.stdout)
    except subprocess.CalledProcessError as exc:
        sys.stdout.write(f"安装依赖失败,退出码: {exc.returncode}\n")
        return exc.returncode or 1

    # Only stamp after a successful install so a failed run is retried next time.
    _write_stamp(requirements_file, venv_dir)
    sys.stdout.write(f"依赖安装完成,当前技能虚拟环境: {venv_dir}\n")
    return 0
if __name__ == "__main__":
    # Exit with main()'s status. An unexpected exception has its traceback
    # printed to stdout and is mapped to exit status 1.
    try:
        status = main()
    except SystemExit:
        raise
    except Exception:
        traceback.print_exc(file=sys.stdout)
        status = 1
    raise SystemExit(status)

View File

@ -1,2 +0,0 @@
cryptography
pymysql>=1.1,<2

View File

@ -1,957 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import argparse
import base64
import gzip
import json
import os
import subprocess
import sys
import tempfile
import traceback
import urllib.error
import urllib.parse
import urllib.request
import uuid
import zlib
from pathlib import Path
# Route everything written to stderr into stdout.
sys.stderr = sys.stdout
# Emotion names accepted by _normalize_emotion; any other value is normalized to "".
VALID_EMOTIONS = {
"happy",
"sad",
"angry",
"surprised",
"fear",
"hate",
"excited",
"lovey-dovey",
"shy",
"comfort",
"tension",
"tender",
"magnetic",
"vocal-fry",
"ASMR",
}
# Alternate spellings mapped to their canonical entry in VALID_EMOTIONS.
EMOTION_ALIASES = {
"vocal - fry": "vocal-fry",
}
# Speaker used by _build_request_body when neither the CLI nor the config supplies one.
DEFAULT_SPEAKER = "zh_female_vv_uranus_bigtts"
# Audio format and sample rate that _build_request_body always writes into the request.
DEFAULT_AUDIO_FORMAT = "mp3"
DEFAULT_SAMPLE_RATE = 24000
# Defaults for the mimo backend when the config leaves the value empty.
DEFAULT_MIMO_BASE_URL = "https://api.xiaomimimo.com/v1"
DEFAULT_MIMO_MODEL = "mimo-v2.5-tts"
DEFAULT_MIMO_VOICE = "mimo_default"
DEFAULT_MIMO_AUDIO_FORMAT = "wav"
# Format requested from mimo in streaming mode; the PCM data is wrapped into WAV afterwards.
MIMO_STREAM_AUDIO_FORMAT = "pcm16"
# Sample rate passed to _pcm16le_to_wav when wrapping mimo PCM output.
MIMO_PCM_SAMPLE_RATE = 24000
# Model names selected by _resolve_mimo_model for voice design and voice cloning.
MIMO_VOICE_DESIGN_MODEL = "mimo-v2.5-tts-voicedesign"
MIMO_VOICE_CLONE_MODEL = "mimo-v2.5-tts-voiceclone"
# Value of a message row's `type` that _load_referenced_voice_clone treats as a voice message.
WECHAT_VOICE_MESSAGE_TYPE = 34
# main() rejects input text longer than this many characters.
MAX_CONTENT_LENGTH = 260
# Response code at which synthesize_audio stops reading the stream.
STREAM_END_CODE = 20000000
def _skill_root() -> Path:
return Path(__file__).resolve().parent.parent
def _skill_venv_python() -> Path:
    """Return the path of the Python interpreter inside the skill's `.venv`."""
    venv = _skill_root() / ".venv"
    # Windows virtual environments place the interpreter under Scripts\.
    relative = ("Scripts", "python.exe") if sys.platform == "win32" else ("bin", "python")
    return venv.joinpath(*relative)
def _get_python_executable() -> str:
if sys.executable:
return sys.executable
import shutil
for candidate in ("python3", "python"):
found = shutil.which(candidate)
if found:
return found
raise RuntimeError("无法找到 Python 解释器路径")
def _run_bootstrap() -> None:
    """Run the sibling `bootstrap.py`; exit with its status if it fails.

    Raises:
        SystemExit: with the bootstrap's return code when it is non-zero.
    """
    script = Path(__file__).resolve().parent / "bootstrap.py"
    completed = subprocess.run([_get_python_executable(), str(script)])
    status = completed.returncode
    if status != 0:
        raise SystemExit(status)
def _ensure_skill_venv_python() -> None:
    """Make sure the script runs under the skill's own virtual environment.

    Creates the environment through the bootstrap script when its interpreter
    is missing. If the current interpreter is not the one from `.venv`, the
    process is replaced (``os.execv``) by the venv interpreter running this
    same script with the same arguments.

    Raises:
        SystemExit: if the venv interpreter is still missing after bootstrap,
            or if the bootstrap itself fails.
    """
    venv_python = _skill_venv_python()
    if not venv_python.is_file():
        _run_bootstrap()
        venv_python = _skill_venv_python()
        if not venv_python.is_file():
            sys.stdout.write("bootstrap 后仍未找到虚拟环境\n")
            raise SystemExit(1)
    venv_dir = _skill_root() / ".venv"
    # Resolve both sides: sys.prefix is not guaranteed to be symlink-free, and
    # comparing it raw against a resolved path would never match when `.venv`
    # is a symlink, causing an endless chain of re-execs.
    if Path(sys.prefix).resolve() == venv_dir.resolve():
        return
    os.execv(str(venv_python), [str(venv_python), str(Path(__file__).resolve()), *sys.argv[1:]])
# Switch to the skill's virtual environment before importing third-party code.
_ensure_skill_venv_python()
try:
    import pymysql  # type: ignore # noqa: E402
except ModuleNotFoundError:
    # Install the dependencies and restart once. The environment marker stops
    # an endless restart loop when the bootstrap reports success but pymysql
    # still cannot be imported (for example when it skips installation because
    # it considers the dependencies up to date).
    if os.environ.get("WECHAT_ROBOT_TTS_BOOTSTRAP_RETRIED") == "1":
        sys.stdout.write("bootstrap 后仍无法导入 pymysql\n")
        raise SystemExit(1)
    os.environ["WECHAT_ROBOT_TTS_BOOTSTRAP_RETRIED"] = "1"
    _run_bootstrap()
    _py = _get_python_executable()
    os.execv(_py, [_py, str(Path(__file__).resolve()), *sys.argv[1:]])
def _mysql_connect():
host = os.environ.get("MYSQL_HOST", "127.0.0.1")
port = int(os.environ.get("MYSQL_PORT", "3306"))
user = os.environ.get("MYSQL_USER", "root")
password = os.environ.get("MYSQL_PASSWORD", "")
database = os.environ.get("ROBOT_CODE", "")
if not database:
raise RuntimeError("环境变量 ROBOT_CODE 未配置")
return pymysql.connect(
host=host,
port=port,
user=user,
password=password,
database=database,
charset="utf8mb4",
connect_timeout=10,
read_timeout=300,
write_timeout=300,
)
def _query_one(conn, sql: str, params: tuple = ()) -> dict | None:
cur = conn.cursor()
cur.execute(sql, params)
columns = [desc[0] for desc in cur.description] if cur.description else []
row = cur.fetchone()
cur.close()
if row is None:
return None
return dict(zip(columns, row))
def _load_json_field(raw: object) -> dict:
if raw is None:
return {}
if isinstance(raw, (bytes, bytearray)):
raw = raw.decode("utf-8")
if isinstance(raw, str):
if not raw.strip():
return {}
value = json.loads(raw)
return value if isinstance(value, dict) else {}
if isinstance(raw, dict):
return raw
return {}
def load_tts_settings(conn, from_wx_id: str) -> tuple[bool, str, dict, str, str]:
    """Load the effective TTS settings for a chat.

    Global settings are read first; values from the chat-room row (ids ending
    in ``@chatroom``) or the friend row then override them when present.

    Returns:
        ``(enabled, tts_model, tts_settings, fallback_base_url, fallback_api_key)``.
    """

    def _stripped(row: dict, key: str) -> str:
        return str(row.get(key) or "").strip()

    enabled = False
    tts_model: str = "doubao"
    settings_json: dict = {}
    fallback_base_url: str = ""
    fallback_api_key: str = ""

    defaults = _query_one(
        conn,
        "SELECT tts_enabled, tts_model, tts_settings, chat_base_url, chat_api_key FROM global_settings LIMIT 1",
    )
    if defaults:
        if defaults.get("tts_enabled") is not None:
            enabled = bool(defaults["tts_enabled"])
        if defaults.get("tts_model"):
            tts_model = str(defaults["tts_model"]).strip() or "doubao"
        settings_json = _load_json_field(defaults.get("tts_settings"))
        fallback_base_url = _stripped(defaults, "chat_base_url")
        fallback_api_key = _stripped(defaults, "chat_api_key")

    # Chat rooms and friends keep their overrides in different tables.
    if from_wx_id.endswith("@chatroom"):
        override_sql = "SELECT tts_enabled, tts_model, tts_settings, chat_base_url, chat_api_key FROM chat_room_settings WHERE chat_room_id = %s LIMIT 1"
    else:
        override_sql = "SELECT tts_enabled, tts_model, tts_settings, chat_base_url, chat_api_key FROM friend_settings WHERE wechat_id = %s LIMIT 1"
    override = _query_one(conn, override_sql, (from_wx_id,))

    if override:
        if override.get("tts_enabled") is not None:
            enabled = bool(override["tts_enabled"])
        if override.get("tts_model"):
            tts_model = str(override["tts_model"]).strip() or tts_model
        # Only non-empty override values replace the global ones.
        settings_json = _load_json_field(override.get("tts_settings")) or settings_json
        fallback_base_url = _stripped(override, "chat_base_url") or fallback_base_url
        fallback_api_key = _stripped(override, "chat_api_key") or fallback_api_key

    return enabled, tts_model, settings_json, fallback_base_url, fallback_api_key
def _clean_text(value: object) -> str:
return str(value or "").strip()
def _clean_text_list(values: object) -> list[str]:
    """Clean every item of a list, dropping the ones that end up empty.

    Anything that is not a list yields an empty list.
    """
    if not isinstance(values, list):
        return []
    cleaned: list[str] = []
    for entry in values:
        text = _clean_text(entry)
        if text:
            cleaned.append(text)
    return cleaned
def _coerce_bool(value: object, default: bool = False) -> bool:
if value is None:
return default
if isinstance(value, bool):
return value
if isinstance(value, (int, float)):
return bool(value)
if isinstance(value, str):
normalized = value.strip().lower()
if normalized in {"1", "true", "yes", "y", "on"}:
return True
if normalized in {"0", "false", "no", "n", "off"}:
return False
return default
def _normalize_emotion(emotion: str) -> str:
    """Map an emotion name to its canonical form, or ``""`` if unsupported."""
    key = emotion.strip()
    canonical = EMOTION_ALIASES.get(key, key)
    if canonical in VALID_EMOTIONS:
        return canonical
    return ""
def _download_referenced_voice_clone(message_id: str) -> str:
client_port = os.environ.get("ROBOT_WECHAT_CLIENT_PORT", "").strip()
if not client_port:
raise RuntimeError("环境变量 ROBOT_WECHAT_CLIENT_PORT 未配置")
encoded_message_id = urllib.parse.quote(message_id, safe="")
download_url = (
f"http://127.0.0.1:{client_port}/api/v1/robot/chat/voice/download"
f"?message_id={encoded_message_id}"
)
req = urllib.request.Request(download_url, method="GET")
try:
with urllib.request.urlopen(req, timeout=60) as response:
wav_data = response.read()
except urllib.error.HTTPError as exc:
error_body = exc.read().decode("utf-8", errors="replace")
raise RuntimeError(f"下载引用语音失败,状态码 {exc.code}: {error_body}") from exc
except urllib.error.URLError as exc:
raise RuntimeError(f"下载引用语音失败: {exc}") from exc
if not wav_data:
raise RuntimeError("下载引用语音失败: 响应为空")
audio_b64 = base64.b64encode(wav_data).decode("utf-8")
return f"data:audio/wav;base64,{audio_b64}"
def _load_referenced_voice_clone(conn) -> str:
    """Return the quoted voice message as a data URL, or ``""``.

    Uses ROBOT_REF_MESSAGE_ID to look up the quoted message. Returns ``""``
    when no message is quoted, the row is missing, or it is not a voice
    message.
    """
    ref_id = os.environ.get("ROBOT_REF_MESSAGE_ID", "").strip()
    if not ref_id:
        return ""
    row = _query_one(conn, "SELECT * FROM messages WHERE msg_id = %s LIMIT 1", (ref_id,))
    if not row:
        return ""
    try:
        kind = int(row.get("type") or 0)
    except (TypeError, ValueError):
        return ""
    if kind == WECHAT_VOICE_MESSAGE_TYPE:
        return _download_referenced_voice_clone(ref_id)
    return ""
def _parse_cli_params(argv: list[str]) -> dict:
    """Parse the skill's command-line options into a parameter dict.

    Args:
        argv: Arguments without the program name.

    Returns:
        Dict with one key per option. ``content`` is kept verbatim; the other
        scalar options are stripped and the repeatable options
        (``context_texts``, ``style_prompt``, ``audio_tags``) become cleaned lists.

    Raises:
        ValueError: if unsupported arguments are present.
    """
    scalar_options = (
        "content", "emotion", "voice", "voice_prompt",
        "speaking_rate", "pitch", "volume", "dialect",
    )
    repeatable_options = ("context_texts", "style_prompt", "audio_tags")

    parser = argparse.ArgumentParser(add_help=False)
    for option in scalar_options:
        parser.add_argument(f"--{option}", default="")
    for option in repeatable_options:
        parser.add_argument(f"--{option}", action="append", default=[])

    parsed, leftovers = parser.parse_known_args(argv)
    if leftovers:
        raise ValueError(f"存在不支持的参数: {' '.join(leftovers)}")

    return {
        "content": parsed.content,
        "emotion": _clean_text(parsed.emotion),
        "context_texts": _clean_text_list(parsed.context_texts),
        "voice": _clean_text(parsed.voice),
        "style_prompt": _clean_text_list(parsed.style_prompt),
        "voice_prompt": _clean_text(parsed.voice_prompt),
        "audio_tags": _clean_text_list(parsed.audio_tags),
        "speaking_rate": _clean_text(parsed.speaking_rate),
        "pitch": _clean_text(parsed.pitch),
        "volume": _clean_text(parsed.volume),
        "dialect": _clean_text(parsed.dialect),
    }
def _build_request_headers(config: dict) -> dict[str, str]:
request_header = config.get("request_header") or {}
if not isinstance(request_header, dict):
raise RuntimeError("request_header 配置格式错误")
app_id = str(request_header.get("X-Api-App-Id") or "").strip()
access_key = str(request_header.get("X-Api-Access-Key") or "").strip()
resource_id = str(request_header.get("X-Api-Resource-Id") or "").strip()
if not app_id or not access_key or not resource_id:
raise RuntimeError("请求头参数不能为空")
headers = {
"Content-Type": "application/json",
"X-Api-App-Id": app_id,
"X-Api-Access-Key": access_key,
"X-Api-Resource-Id": resource_id,
}
request_id = str(request_header.get("X-Api-Request-Id") or "").strip()
if request_id:
headers["X-Api-Request-Id"] = request_id
usage_header = str(request_header.get("X-Control-Require-Usage-Tokens-Return") or "").strip()
if usage_header:
headers["X-Control-Require-Usage-Tokens-Return"] = usage_header
return headers
def _build_control_texts(params: dict) -> list[str]:
    """Collect the context lines sent with the doubao request.

    Order: context texts, style prompts, the labeled scalar controls
    (emotion, voice prompt, rate, pitch, volume, dialect), then one line per
    audio tag. Empty entries are dropped.
    """
    lines = list(params.get("context_texts") or [])
    lines += params.get("style_prompt") or []

    labels = {
        "emotion": "情绪/风格",
        "voice_prompt": "音色描述",
        "speaking_rate": "语速",
        "pitch": "音高",
        "volume": "音量",
        "dialect": "方言/口音",
    }
    for key, label in labels.items():
        text = _clean_text(params.get(key))
        if text:
            lines.append(f"{label}: {text}")

    lines += [f"音频标签: {tag}" for tag in params.get("audio_tags") or []]
    return [line for line in lines if line]
def _build_request_body(config: dict, params: dict) -> dict:
    """Build the doubao request body from the configured template and CLI params.

    The configured ``request_body`` is copied, then filled with a fresh user
    id, the speaker, the text, fixed audio parameters, the optional emotion
    and the context texts.

    Raises:
        RuntimeError: if any of the nested template sections is not a dict.
    """

    def _section(parent: dict, key: str) -> dict:
        # Fetch (or create) a nested dict section, rejecting other types.
        section = parent.setdefault(key, {})
        if not isinstance(section, dict):
            raise RuntimeError(f"{key} 配置格式错误")
        return section

    template = config.get("request_body") or {}
    if not isinstance(template, dict):
        raise RuntimeError("request_body 配置格式错误")
    text = params.get("content", "")
    # JSON round trip gives a deep copy so the stored config is never mutated.
    body = json.loads(json.dumps(template))

    _section(body, "user")["uid"] = str(uuid.uuid4())

    req_params = _section(body, "req_params")
    requested_voice = _clean_text(params.get("voice"))
    if requested_voice:
        req_params["speaker"] = requested_voice
    elif not str(req_params.get("speaker") or "").strip():
        req_params["speaker"] = DEFAULT_SPEAKER
    req_params["text"] = text

    audio_params = _section(req_params, "audio_params")
    audio_params["format"] = DEFAULT_AUDIO_FORMAT
    audio_params["sample_rate"] = DEFAULT_SAMPLE_RATE
    emotion = _normalize_emotion(_clean_text(params.get("emotion")))
    if emotion:
        audio_params.update({"emotion": emotion, "emotion_scale": 5})

    additions = _section(req_params, "x-additions")
    control_texts = _build_control_texts(params)
    if control_texts:
        additions["context_texts"] = control_texts
    return body
def synthesize_audio(config: dict, params: dict) -> tuple[bytes, str]:
    """Synthesize speech through the doubao streaming endpoint.

    Sends the request built from ``config``/``params`` and reads the response
    line by line, concatenating the base64 audio chunks it carries.

    Returns:
        ``(audio_bytes, audio_format)``.

    Raises:
        RuntimeError: on a missing URL, transport/HTTP errors, unparsable
            lines, a positive error code, or when no audio was received.
    """
    endpoint = str(config.get("url") or "").strip()
    if not endpoint:
        raise RuntimeError("语音合成地址不能为空")

    headers = _build_request_headers(config)
    body = _build_request_body(config, params)
    request = urllib.request.Request(
        endpoint,
        data=json.dumps(body).encode("utf-8"),
        headers=headers,
        method="POST",
    )
    try:
        stream = urllib.request.urlopen(request, timeout=300)
    except urllib.error.HTTPError as exc:
        error_body = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"API请求失败状态码 {exc.code}: {error_body}") from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(f"发送请求失败: {exc}") from exc

    requested_format = ((body.get("req_params") or {}).get("audio_params") or {}).get("format")
    audio_format = str(requested_format or DEFAULT_AUDIO_FORMAT).strip() or DEFAULT_AUDIO_FORMAT

    collected = bytearray()
    with stream:
        for raw in stream:
            text = raw.decode("utf-8", errors="replace").strip()
            # Lines may carry an SSE-style "data:" prefix.
            if text.startswith("data:"):
                text = text[5:].strip()
            if not text:
                continue
            try:
                event = json.loads(text)
            except json.JSONDecodeError as exc:
                raise RuntimeError(f"解析响应失败: {exc}, 行内容: {text}") from exc

            code = int(event.get("code") or 0)
            message = str(event.get("message") or "")
            chunk = event.get("data")
            if code == 0:
                if isinstance(chunk, str) and chunk:
                    try:
                        collected.extend(base64.b64decode(chunk))
                    except Exception as exc:
                        raise RuntimeError(f"解码音频数据失败: {exc}") from exc
                    continue
                if isinstance(event.get("sentence"), dict):
                    # Sentence events carry no audio.
                    continue
            if code == STREAM_END_CODE:
                break
            if code > 0:
                raise RuntimeError(f"合成失败,错误码: {code}, 错误信息: {message}")

    if not collected:
        raise RuntimeError("未接收到音频数据")
    return bytes(collected), audio_format
def _pcm16le_to_wav(pcm_data: bytes, sample_rate: int = 24000, channels: int = 1) -> bytes:
import struct
data_size = len(pcm_data)
byte_rate = sample_rate * channels * 2
block_align = channels * 2
header = struct.pack(
"<4sI4s4sIHHIIHH4sI",
b"RIFF",
36 + data_size,
b"WAVE",
b"fmt ",
16,
1,
channels,
sample_rate,
byte_rate,
block_align,
16,
b"data",
data_size,
)
return header + pcm_data
def _config_texts(config: dict, key: str) -> list[str]:
    """Read a config entry that may be one string or a list of strings.

    Returns a list of cleaned, non-empty texts (possibly empty).
    """
    raw = config.get(key)
    if isinstance(raw, list):
        return _clean_text_list(raw)
    single = _clean_text(raw)
    if single:
        return [single]
    return []
def _resolve_mimo_model(config: dict, params: dict) -> str:
    """Choose the mimo model for this request.

    Priority: clone audio in ``params`` -> clone model; with ``auto_model``
    enabled (the default), clone audio in the config -> clone model and any
    voice prompt -> voice-design model; then the configured model; finally
    the default model.
    """
    explicit_model = _clean_text(config.get("model"))
    if _clean_text(params.get("voice_clone_audio")):
        return MIMO_VOICE_CLONE_MODEL

    if _coerce_bool(config.get("auto_model"), True):
        if _clean_text(config.get("voice_clone_audio")):
            return MIMO_VOICE_CLONE_MODEL
        if _clean_text(params.get("voice_prompt")) or _clean_text(config.get("voice_prompt")):
            return MIMO_VOICE_DESIGN_MODEL

    return explicit_model or DEFAULT_MIMO_MODEL
def _format_mimo_audio_tags(tags: list[str]) -> str:
cleaned_tags = [tag.strip("()[] ") for tag in tags if tag.strip("()[] ")]
if not cleaned_tags:
return ""
return f"({' '.join(cleaned_tags)})"
def _build_mimo_assistant_content(params: dict) -> str:
    """Return the text to speak, prefixed by the formatted audio tags if any."""
    text = _clean_text(params.get("content"))
    prefix = _format_mimo_audio_tags(params.get("audio_tags") or [])
    if prefix:
        return f"{prefix}{text}"
    return text
def _build_mimo_user_content(config: dict, params: dict, model: str) -> str:
    """Build the newline-joined style instructions sent as the user message.

    CLI parameters take precedence over config values for the scalar fields;
    list fields from the config come before those from the CLI.

    Raises:
        RuntimeError: if the voice-design model is selected but no
            instruction could be built.
    """
    is_voice_design = model == MIMO_VOICE_DESIGN_MODEL
    lines: list[str] = []

    voice_prompt = _clean_text(params.get("voice_prompt")) or _clean_text(config.get("voice_prompt"))
    if voice_prompt:
        # The voice-design model takes the description verbatim.
        lines.append(voice_prompt if is_voice_design else f"音色/声线: {voice_prompt}")

    for key in ("style_prompt", "context_texts"):
        lines += _config_texts(config, key)
        lines += params.get(key) or []

    labels = {
        "emotion": "情绪/风格",
        "speaking_rate": "语速",
        "pitch": "音高",
        "volume": "音量",
        "dialect": "方言/口音",
    }
    for key, label in labels.items():
        text = _clean_text(params.get(key)) or _clean_text(config.get(key))
        if text:
            lines.append(f"{label}: {text}")

    if is_voice_design and not lines:
        raise RuntimeError("mimo 文本音色设计模型需要 voice_prompt 或 style_prompt")
    return "\n".join(lines)
def _resolve_mimo_voice(config: dict, params: dict, model: str) -> str:
    """Return the ``voice`` value for the mimo audio options.

    Voice-design requests carry no voice (``""``); clone requests carry the
    reference audio as a data URL; other models use the requested, configured
    or default voice name.

    Raises:
        RuntimeError: if the clone model is selected without reference audio.
    """
    if model == MIMO_VOICE_DESIGN_MODEL:
        return ""

    if model != MIMO_VOICE_CLONE_MODEL:
        return _clean_text(params.get("voice")) or _clean_text(config.get("voice")) or DEFAULT_MIMO_VOICE

    reference = _clean_text(params.get("voice_clone_audio")) or _clean_text(config.get("voice_clone_audio"))
    if not reference:
        raise RuntimeError("mimo 音色复刻模型需要引用一条语音消息或配置 voice_clone_audio")
    if reference.startswith("data:"):
        return reference
    # Bare base64 is wrapped into a data URL with the configured MIME type.
    mime_type = (
        _clean_text(params.get("voice_clone_mime_type"))
        or _clean_text(config.get("voice_clone_mime_type"))
        or "audio/mpeg"
    )
    return f"data:{mime_type};base64,{reference}"
def _build_mimo_payload(config: dict, params: dict) -> tuple[dict, str, bool]:
    """Assemble the chat-completions payload for mimo.

    Returns:
        ``(payload, audio_format, stream)`` where ``audio_format`` is the
        format requested from the API and ``stream`` tells whether streaming
        was requested.
    """
    model = _resolve_mimo_model(config, params)
    stream = _coerce_bool(config.get("stream"), False)
    if stream:
        audio_format = MIMO_STREAM_AUDIO_FORMAT
    else:
        audio_format = (
            _clean_text(config.get("audio_format"))
            or _clean_text(config.get("format"))
            or DEFAULT_MIMO_AUDIO_FORMAT
        )

    instructions = _build_mimo_user_content(config, params, model)
    messages = []
    # The clone model gets a user message even when it is empty.
    if instructions or model == MIMO_VOICE_CLONE_MODEL:
        messages.append({"role": "user", "content": instructions})
    messages.append({"role": "assistant", "content": _build_mimo_assistant_content(params)})

    audio_options = {"format": audio_format}
    voice = _resolve_mimo_voice(config, params, model)
    if voice:
        audio_options["voice"] = voice

    payload = {"model": model, "messages": messages, "audio": audio_options}
    if stream:
        payload["stream"] = True
    return payload, audio_format, stream
def _decompress_response_bytes(raw: bytes, encoding: str) -> bytes:
encoding = (encoding or "").strip().lower()
if not encoding or encoding == "identity":
return raw
if encoding == "gzip":
return gzip.decompress(raw)
if encoding == "deflate":
try:
return zlib.decompress(raw)
except zlib.error:
return zlib.decompress(raw, -zlib.MAX_WBITS)
if encoding == "br":
try:
import brotli # type: ignore
except ModuleNotFoundError as exc:
raise RuntimeError(
"mimo 响应使用了 brotli 压缩,但当前环境未安装 brotli请安装后重试"
) from exc
return brotli.decompress(raw)
raise RuntimeError(f"mimo 响应使用了不支持的 Content-Encoding: {encoding}")
def _read_response_text(response) -> str:
    """Read the whole response, undo its Content-Encoding and decode it as UTF-8.

    Undecodable bytes are replaced rather than raising.
    """
    body = response.read()
    content_encoding = response.headers.get("Content-Encoding", "")
    decoded = _decompress_response_bytes(body, content_encoding)
    return decoded.decode("utf-8", errors="replace")
def _decode_mimo_audio(audio_b64: object, audio_format: str) -> tuple[bytes, str]:
    """Decode base64 audio returned by mimo.

    PCM16 output is wrapped into a WAV container and reported as ``"wav"``;
    any other format is returned unchanged.

    Raises:
        RuntimeError: if the audio field is missing/empty or cannot be decoded.
    """
    if not (isinstance(audio_b64, str) and audio_b64):
        raise RuntimeError("mimo 响应未包含音频数据")
    try:
        decoded = base64.b64decode(audio_b64)
    except Exception as exc:
        raise RuntimeError(f"解码 mimo 音频数据失败: {exc}") from exc
    if audio_format != MIMO_STREAM_AUDIO_FORMAT:
        return decoded, audio_format
    return _pcm16le_to_wav(decoded, sample_rate=MIMO_PCM_SAMPLE_RATE), "wav"
def _read_mimo_non_stream_response(response, audio_format: str) -> tuple[bytes, str]:
    """Extract the audio from a non-streaming mimo chat-completions response.

    Returns:
        ``(audio_bytes, audio_format)`` as produced by ``_decode_mimo_audio``.

    Raises:
        RuntimeError: if the body is not JSON (with a dedicated hint when it
            looks like an HTML page), reports an error, or has no choices.
    """
    body_text = _read_response_text(response)
    try:
        parsed = json.loads(body_text)
    except json.JSONDecodeError as exc:
        preview = body_text[:300]
        lowered = body_text.lower()
        # An HTML page usually means the request hit a gateway's web UI.
        if "<html" in lowered or "<!doctype" in lowered:
            raise RuntimeError(
                "mimo 响应不是 JSON疑似 base_url 配置错误(被网关前端 SPA 拦截),"
                "请检查 base_url 是否配置为带 /v1 的完整地址,例如 https://api.xiaomimimo.com/v1。"
                f"响应片段: {preview}"
            ) from exc
        raise RuntimeError(f"解析 mimo 响应失败: {exc}, 响应内容: {preview}") from exc

    error = parsed.get("error")
    if isinstance(error, dict):
        detail = _clean_text(error.get("message")) or json.dumps(error, ensure_ascii=False)
        raise RuntimeError(f"mimo 合成失败: {detail}")

    choices = parsed.get("choices") or []
    if not choices:
        raise RuntimeError(f"mimo 响应缺少 choices: {body_text}")

    first_message = choices[0].get("message") or {}
    audio = first_message.get("audio") or {}
    encoded = audio.get("data") if isinstance(audio, dict) else None
    return _decode_mimo_audio(encoded, audio_format)
def _read_mimo_stream_response(response) -> tuple[bytes, str]:
    """Collect the PCM audio from a streaming (SSE) mimo response.

    Only ``data:`` lines are considered; lines that are not valid JSON or
    carry no choices are skipped, and ``[DONE]`` ends the stream.

    Returns:
        ``(wav_bytes, "wav")`` with the PCM data wrapped into a WAV container.

    Raises:
        RuntimeError: if a chunk reports an error, audio cannot be decoded,
            or no audio was received at all.
    """
    pcm = bytearray()
    with response:
        for raw in response:
            text = raw.decode("utf-8", errors="replace").strip()
            if not text.startswith("data:"):
                continue
            event_text = text[5:].strip()
            if event_text == "[DONE]":
                break
            try:
                event = json.loads(event_text)
            except json.JSONDecodeError:
                continue

            error = event.get("error")
            if isinstance(error, dict):
                detail = _clean_text(error.get("message")) or json.dumps(error, ensure_ascii=False)
                raise RuntimeError(f"mimo 合成失败: {detail}")

            choices = event.get("choices") or []
            if not choices:
                continue
            delta = choices[0].get("delta") or {}
            audio = delta.get("audio") or {}
            encoded = audio.get("data") if isinstance(audio, dict) else None
            if not encoded:
                continue
            try:
                pcm.extend(base64.b64decode(encoded))
            except Exception as exc:
                raise RuntimeError(f"解码 mimo 音频数据失败: {exc}") from exc

    if not pcm:
        raise RuntimeError("mimo 未接收到音频数据")
    return _pcm16le_to_wav(bytes(pcm), sample_rate=MIMO_PCM_SAMPLE_RATE), "wav"
def synthesize_audio_mimo(config: dict, params: dict) -> tuple[bytes, str]:
    """Synthesize speech through the mimo chat-completions API.

    Args:
        config: mimo settings; ``api_key`` is required, ``base_url`` defaults
            to ``DEFAULT_MIMO_BASE_URL``.
        params: Parsed CLI parameters.

    Returns:
        ``(audio_bytes, audio_format)``.

    Raises:
        RuntimeError: if the API key is missing, the request fails, or the
            response carries no usable audio.
    """
    api_key = str(config.get("api_key") or "").strip()
    base_url = str(config.get("base_url") or DEFAULT_MIMO_BASE_URL).strip().rstrip("/")
    if not api_key:
        raise RuntimeError("mimo api_key 不能为空")
    # Tolerate a base_url configured as a bare root address without /v1 (as
    # with gateways such as New API / OneAPI), so the request is not answered
    # by the gateway front end's index.html fallback.
    parsed_base = urllib.parse.urlsplit(base_url)
    base_path = parsed_base.path or ""
    if not base_path or base_path == "/":
        base_url = f"{base_url}/v1"
    url = f"{base_url}/chat/completions"
    payload, audio_format, stream = _build_mimo_payload(config, params)
    request_data = json.dumps(payload, ensure_ascii=False).encode("utf-8")
    req = urllib.request.Request(
        url,
        data=request_data,
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
            "Accept": "application/json, text/event-stream",
            "Accept-Encoding": "identity",
        },
        method="POST",
    )
    try:
        response = urllib.request.urlopen(req, timeout=300)
    except urllib.error.HTTPError as exc:
        # Read the body exactly once: a second read() after a failed
        # decompression would return nothing and hide the server's message.
        try:
            raw_error = exc.read()
        except Exception:
            raw_error = b""
        try:
            raw_error = _decompress_response_bytes(raw_error, exc.headers.get("Content-Encoding", ""))
        except Exception:
            # Fall back to the bytes as received.
            pass
        error_body = raw_error.decode("utf-8", errors="replace")
        raise RuntimeError(f"mimo API请求失败状态码 {exc.code}: {error_body}") from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(f"mimo 发送请求失败: {exc}") from exc
    if stream:
        return _read_mimo_stream_response(response)
    with response:
        return _read_mimo_non_stream_response(response, audio_format)
def _guess_mime_type(audio_format: str) -> str:
fmt = audio_format.lower()
if fmt == "mp3":
return "audio/mpeg"
if fmt == "wav":
return "audio/wav"
if fmt == "amr":
return "audio/amr"
return "application/octet-stream"
def _encode_multipart_formdata(fields: dict[str, str], files: list[tuple[str, str, bytes, str]]) -> tuple[bytes, str]:
boundary = f"----wechatrobot{uuid.uuid4().hex}"
chunks: list[bytes] = []
for name, value in fields.items():
chunks.extend(
[
f"--{boundary}\r\n".encode("utf-8"),
f'Content-Disposition: form-data; name="{name}"\r\n\r\n'.encode("utf-8"),
value.encode("utf-8"),
b"\r\n",
]
)
for field_name, filename, data, content_type in files:
chunks.extend(
[
f"--{boundary}\r\n".encode("utf-8"),
(
f'Content-Disposition: form-data; name="{field_name}"; '
f'filename="{filename}"\r\n'
).encode("utf-8"),
f"Content-Type: {content_type}\r\n\r\n".encode("utf-8"),
data,
b"\r\n",
]
)
chunks.append(f"--{boundary}--\r\n".encode("utf-8"))
return b"".join(chunks), boundary
def send_voice(from_wx_id: str, audio_data: bytes, audio_format: str) -> None:
    """Upload synthesized audio to the local robot client as a voice message.

    Args:
        from_wx_id: Chat (room or friend id) that receives the voice message.
        audio_data: Encoded audio bytes.
        audio_format: Audio format name, used for the file extension and the
            part's MIME type.

    Raises:
        RuntimeError: if ROBOT_WECHAT_CLIENT_PORT is unset or the upload fails.
    """
    client_port = os.environ.get("ROBOT_WECHAT_CLIENT_PORT", "").strip()
    if not client_port:
        raise RuntimeError("环境变量 ROBOT_WECHAT_CLIENT_PORT 未配置")
    send_url = f"http://127.0.0.1:{client_port}/api/v1/robot/message/send/voice"
    suffix = f".{audio_format.lower() or DEFAULT_AUDIO_FORMAT}"
    # Only a file name is needed for the multipart part; the audio is already
    # in memory, so no temporary file has to be written and read back.
    filename = f"voice-message-{uuid.uuid4().hex[:8]}{suffix}"
    body, boundary = _encode_multipart_formdata(
        {"to_wxid": from_wx_id},
        [("voice", filename, bytes(audio_data), _guess_mime_type(audio_format))],
    )
    req = urllib.request.Request(
        send_url,
        data=body,
        headers={"Content-Type": f"multipart/form-data; boundary={boundary}"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=60) as resp:
            resp.read()
    except urllib.error.HTTPError as exc:
        error_body = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"发送语音失败,状态码 {exc.code}: {error_body}") from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(f"发送语音失败: {exc}") from exc
def main() -> int:
    """Entry point of the text-to-speech skill.

    Validates the CLI input, loads the TTS settings for the current chat,
    synthesizes the audio with the configured backend and sends it as a voice
    message. Every failure is reported on stdout.

    Returns:
        0 on success or when TTS is disabled, 1 on any failure.
    """

    def _fail(message: str) -> int:
        sys.stdout.write(message)
        return 1

    cli_args = sys.argv[1:]
    if not cli_args:
        return _fail("缺少输入参数\n")
    try:
        params = _parse_cli_params(cli_args)
    except ValueError as exc:
        return _fail(f"参数格式错误: {exc}\n")

    text = params.get("content", "").strip()
    if not text:
        return _fail("文本转语音的输入文本不能为空\n")
    if len(text) > MAX_CONTENT_LENGTH:
        return _fail("你要说的也太多了,要不你还是说点别的吧。\n")

    from_wx_id = os.environ.get("ROBOT_FROM_WX_ID", "").strip()
    if not from_wx_id:
        return _fail("环境变量 ROBOT_FROM_WX_ID 未配置\n")

    try:
        conn = _mysql_connect()
    except Exception as exc:
        return _fail(f"数据库连接失败: {exc}\n")

    # Everything that needs the database happens here; the connection is
    # closed before the network calls.
    try:
        try:
            enabled, tts_model, tts_settings, fallback_base_url, fallback_api_key = load_tts_settings(conn, from_wx_id)
        except Exception as exc:
            return _fail(f"加载文本转语音配置失败: {exc}\n")
        try:
            if tts_model == "mimo":
                reference_audio = _load_referenced_voice_clone(conn)
                if reference_audio:
                    params = {**params, "voice_clone_audio": reference_audio}
        except Exception as exc:
            return _fail(f"加载引用语音失败: {exc}\n")
    finally:
        try:
            conn.close()
        except Exception:
            pass

    if not enabled:
        sys.stdout.write("文本转语音未开启\n")
        return 0
    if not isinstance(tts_settings, dict) or not tts_settings:
        return _fail("未找到文本转语音配置\n")
    model_config = tts_settings.get(tts_model)
    if not isinstance(model_config, dict) or not model_config:
        return _fail(f"未找到 {tts_model} 的文本转语音配置\n")
    if tts_model not in ("doubao", "mimo"):
        return _fail(f"未知的 TTS 模型: {tts_model}\n")

    try:
        if tts_model == "doubao":
            audio_data, audio_format = synthesize_audio(model_config, params)
        else:
            # Fill missing mimo credentials from the chat settings without
            # mutating the loaded configuration.
            fallbacks = {}
            if not str(model_config.get("api_key") or "").strip() and fallback_api_key:
                fallbacks["api_key"] = fallback_api_key
            if not str(model_config.get("base_url") or "").strip() and fallback_base_url:
                fallbacks["base_url"] = fallback_base_url
            if fallbacks:
                model_config = {**model_config, **fallbacks}
            audio_data, audio_format = synthesize_audio_mimo(model_config, params)
    except Exception as exc:
        return _fail(f"语音合成失败: {exc}\n")

    try:
        send_voice(from_wx_id, audio_data, audio_format)
        sys.stdout.write("ended")
    except Exception as exc:
        return _fail(f"发送语音失败: {exc}\n")
    return 0
if __name__ == "__main__":
    # Script entry point: exit with main()'s status; any unexpected exception
    # is printed to stdout and turned into exit status 1.
    try:
        exit_status = main()
    except SystemExit:
        raise
    except Exception:
        traceback.print_exc(file=sys.stdout)
        exit_status = 1
    raise SystemExit(exit_status)