修复缓存命中bug

This commit is contained in:
h88782481 2026-03-15 13:52:09 +08:00
parent 98f3ae24a0
commit 72223ef412
3 changed files with 178 additions and 0 deletions

View file

@ -43,6 +43,8 @@ from routes.common import (
build_route_context, build_route_context,
chat_error_chunk, chat_error_chunk,
ensure_responses_cache_control, ensure_responses_cache_control,
attach_previous_response_id,
remember_response_id,
inject_instructions_anthropic, inject_instructions_anthropic,
inject_instructions_cc, inject_instructions_cc,
inject_instructions_responses, inject_instructions_responses,
@ -313,6 +315,7 @@ def _handle_responses_backend(ctx: RouteContext, payload: dict[str, Any], turn:
responses_payload['model'] = ctx.upstream_model responses_payload['model'] = ctx.upstream_model
responses_payload = inject_instructions_responses(responses_payload, ctx.custom_instructions, ctx.instructions_position) responses_payload = inject_instructions_responses(responses_payload, ctx.custom_instructions, ctx.instructions_position)
responses_payload = ensure_responses_cache_control(responses_payload) responses_payload = ensure_responses_cache_control(responses_payload)
responses_payload = attach_previous_response_id(responses_payload)
_dbg( _dbg(
'已转换为 Responses 请求:字段=' + str(list(responses_payload.keys())) '已转换为 Responses 请求:字段=' + str(list(responses_payload.keys()))
+ f' 输入项数={len(responses_payload.get("input", []))}' + f' 输入项数={len(responses_payload.get("input", []))}'
@ -347,6 +350,7 @@ def _handle_responses_non_stream(
attach_upstream_response(turn, raw) attach_upstream_response(turn, raw)
_dbg('上游原始响应=' + json.dumps(raw, ensure_ascii=False, default=str)[:1000]) _dbg('上游原始响应=' + json.dumps(raw, ensure_ascii=False, default=str)[:1000])
remember_response_id(payload, raw)
data = responses_to_cc_response(raw, ctx.client_model) data = responses_to_cc_response(raw, ctx.client_model)
return _finalize_chat_response(ctx, data, turn=turn, debug_label='Responses 转回聊天补全后') return _finalize_chat_response(ctx, data, turn=turn, debug_label='Responses 转回聊天补全后')
@ -385,6 +389,10 @@ def _handle_responses_stream(
'completion_tokens': extracted_usage.get('output_tokens', 0), 'completion_tokens': extracted_usage.get('output_tokens', 0),
'total_tokens': extracted_usage.get('total_tokens', 0), 'total_tokens': extracted_usage.get('total_tokens', 0),
} }
if event_type == 'response.completed':
response_obj = event_data.get('response') if isinstance(event_data, dict) else None
if isinstance(response_obj, dict):
remember_response_id(payload, response_obj)
if event_count < 10: if event_count < 10:
_dbg( _dbg(
f'上游事件#{event_count} 类型={event_type} 数据=' f'上游事件#{event_count} 类型={event_type} 数据='

View file

@ -7,8 +7,11 @@ SSE 消息拼装逻辑,避免 `chat.py` 和 `responses.py` 各自维护重复
from __future__ import annotations from __future__ import annotations
from dataclasses import dataclass from dataclasses import dataclass
import hashlib
import json import json
import logging import logging
import threading
import time
from typing import Any from typing import Any
import settings import settings
@ -16,6 +19,10 @@ from utils.http import build_anthropic_headers, build_gemini_headers, build_open
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Held while reading or mutating _RESPONSES_PREV_IDS.
_RESPONSES_PREV_ID_LOCK = threading.Lock()
# Maximum age of a remembered response id; compared against time.time()
# deltas, i.e. seconds (86400 s = 24 h).
_RESPONSES_PREV_ID_TTL = 86400
# Conversation key -> (upstream response id, time.time() value when stored).
_RESPONSES_PREV_IDS: dict[str, tuple[str, float]] = {}
@dataclass(frozen=True) @dataclass(frozen=True)
class RouteContext: class RouteContext:
@ -212,6 +219,161 @@ def ensure_responses_cache_control(payload: dict[str, Any]) -> dict[str, Any]:
return payload return payload
def attach_previous_response_id(payload: dict[str, Any]) -> dict[str, Any]:
    """Fill in ``previous_response_id`` on a multi-turn Responses request.

    Rationale (from the original author): some upstreams only reuse the
    previous server-side response chain and cache on `/v1/responses` when
    `previous_response_id` is carried over, while clients such as Cursor
    resend the full history without that field. The proxy therefore looks
    up the last remembered response id under a stable conversation key.

    Args:
        payload: The outgoing Responses request body. It is modified in
            place when an id is attached.

    Returns:
        The same ``payload`` object. It is returned untouched when it is
        not a dict, already carries a truthy ``previous_response_id``, has
        no usable conversation key, or nothing is remembered for that key.
    """
    if isinstance(payload, dict) and not payload.get('previous_response_id'):
        conversation_key = _responses_prev_id_key(payload)
        # Only consult the store when a key could actually be derived.
        remembered_id = _get_previous_response_id(conversation_key) if conversation_key else ''
        if remembered_id:
            payload['previous_response_id'] = remembered_id
            logger.info('已为 Responses 请求补齐 previous_response_id')
    return payload
def remember_response_id(payload: dict[str, Any], response_data: dict[str, Any]) -> None:
    """Record the latest upstream Responses id for this conversation.

    Args:
        payload: The request body; the conversation key is derived from it.
        response_data: The upstream response object; its ``id`` is stored.

    Nothing is stored when either argument is not a dict, when ``id`` is
    not a non-blank string, or when no conversation key can be derived.
    """
    if not (isinstance(payload, dict) and isinstance(response_data, dict)):
        return

    candidate = response_data.get('id')
    cleaned_id = candidate.strip() if isinstance(candidate, str) else ''
    if not cleaned_id:
        return

    conversation_key = _responses_prev_id_key(payload)
    if not conversation_key:
        return

    with _RESPONSES_PREV_ID_LOCK:
        _RESPONSES_PREV_IDS[conversation_key] = (cleaned_id, time.time())
        # Expired entries are purged on every write, while the lock is held.
        _cleanup_previous_response_ids_locked()
def _responses_prev_id_key(payload: dict[str, Any]) -> str:
    """Build a stable key from the "conversation root" of a Responses request.

    The full `input` is deliberately not hashed: every turn appends history,
    so a hash of the whole list would change each turn and the previous
    response id could never be found again. Only `instructions` plus a root
    seed (see `_responses_root_seed_from_items`) go into the key.

    Args:
        payload: The Responses request body.

    Returns:
        A 24-character hex digest, or ``''`` when there is nothing to key on
        (no instructions and no usable input). Returning ``''`` matters:
        otherwise all such requests would share a single key and could be
        handed another conversation's response id.
    """
    instructions = payload.get('instructions') or ''
    if not isinstance(instructions, str):
        # Non-string instructions would break the concatenation below.
        instructions = json.dumps(instructions, ensure_ascii=False, default=str)

    input_data = payload.get('input')
    if not input_data:
        # Missing, None or empty input carries no root; do not serialise it
        # to '[]' / 'null', which would defeat the emptiness check below.
        seed_input = ''
    elif isinstance(input_data, str):
        seed_input = input_data
    elif isinstance(input_data, list):
        seed_input = _responses_root_seed_from_items(input_data)
        if seed_input == '[]':
            # The list held no user/assistant root message.
            seed_input = ''
    else:
        seed_input = json.dumps(input_data, ensure_ascii=False, default=str)

    raw = instructions + '|' + seed_input
    if not raw.strip('|'):
        return ''
    return hashlib.sha256(raw.encode('utf-8')).hexdigest()[:24]
def _responses_root_seed_from_items(items: list[Any]) -> str:
    """Extract a stable conversation-root fragment from Responses `input`.

    The aim is not to reproduce the conversation but to build a seed that
    stays constant across turns of the same conversation: the first user
    message and, if one exists, the first assistant message.

    Args:
        items: The `input` list. Bare strings count as user messages; dicts
            count when their role is user/assistant and their type is
            ``'message'`` or falsy. Everything else is ignored.

    Returns:
        Compact JSON for a list of zero to two ``{'role', 'content'}``
        objects, user first. ``'[]'`` when no root message was found.
    """
    roots: dict[str, dict[str, Any]] = {}

    for entry in items:
        if isinstance(entry, str):
            roots.setdefault('user', {'role': 'user', 'content': entry})
            continue
        if not isinstance(entry, dict):
            continue

        kind = entry.get('type', '')
        speaker = entry.get('role', '')
        if speaker in ('user', 'assistant') and (kind == 'message' or not kind):
            # Typed messages default to list content, untyped ones to text.
            fallback = [] if kind == 'message' else ''
            candidate = {
                'role': speaker,
                'content': _responses_normalize_content(entry.get('content', fallback)),
            }
            if speaker not in roots:
                roots[speaker] = candidate

        # Both roots found: later items cannot change the seed.
        if 'user' in roots and 'assistant' in roots:
            break

    ordered = [roots[name] for name in ('user', 'assistant') if name in roots]
    return json.dumps(ordered, ensure_ascii=False, separators=(',', ':'))
def _responses_normalize_content(content: Any) -> str:
    """Collapse the various Responses `content` shapes into stable text.

    This is for key computation, not faithful display: only text-bearing
    parts are kept so that formatting differences in the same message do
    not produce different conversation keys.

    Args:
        content: A string, a list of parts (strings or dicts), ``None``,
            or any other value.

    Returns:
        - str: the stripped string.
        - list: the texts of string parts and of dict parts typed
          ``input_text`` / ``output_text`` / ``text`` / ``summary_text``,
          joined with newlines and stripped.
        - None: ``''``.
        - anything else: ``str(content).strip()``.
    """
    if isinstance(content, str):
        return content.strip()
    if not isinstance(content, list):
        return str(content).strip() if content is not None else ''

    text_part_types = ('input_text', 'output_text', 'text', 'summary_text')
    texts: list[str] = []
    for part in content:
        if isinstance(part, str):
            texts.append(part)
        elif isinstance(part, dict) and part.get('type') in text_part_types:
            text = part.get('text', '')
            if text is None:
                text = ''
            # A null or non-string `text` must not make the join below raise.
            texts.append(text if isinstance(text, str) else str(text))
    return '\n'.join(texts).strip()
def _get_previous_response_id(key: str) -> str:
    """Look up the remembered response id for ``key``.

    Args:
        key: Conversation key as produced by `_responses_prev_id_key`.

    Returns:
        The stored response id, or ``''`` when nothing is stored or the
        entry has reached `_RESPONSES_PREV_ID_TTL`. An expired entry is
        removed as a side effect of the lookup.
    """
    with _RESPONSES_PREV_ID_LOCK:
        cached = _RESPONSES_PREV_IDS.get(key)
        if cached:
            stored_id, stored_at = cached
            if (time.time() - stored_at) < _RESPONSES_PREV_ID_TTL:
                return stored_id
            # Too old to be useful: drop it during the lookup.
            _RESPONSES_PREV_IDS.pop(key, None)
        return ''
def _cleanup_previous_response_ids_locked() -> None:
    """Drop every remembered response id that has reached the TTL.

    The table only serves short-lived multi-turn continuation, so entries
    of conversations that went quiet are removed to keep a long-running
    process from accumulating dead state.

    The function takes no lock itself; its one visible caller invokes it
    while holding `_RESPONSES_PREV_ID_LOCK`, as the `_locked` suffix implies.
    """
    now = time.time()
    # Iterate over a snapshot so entries can be deleted during the walk.
    for key, (_, stored_at) in list(_RESPONSES_PREV_IDS.items()):
        if (now - stored_at) >= _RESPONSES_PREV_ID_TTL:
            del _RESPONSES_PREV_IDS[key]
def inject_instructions_anthropic(payload: dict[str, Any], instructions: str, position: str = 'prepend') -> dict[str, Any]: def inject_instructions_anthropic(payload: dict[str, Any], instructions: str, position: str = 'prepend') -> dict[str, Any]:
"""向 Anthropic Messages 请求注入自定义指令(写入 system 字段)。 """向 Anthropic Messages 请求注入自定义指令(写入 system 字段)。

View file

@ -28,6 +28,8 @@ from routes.common import (
build_responses_target, build_responses_target,
build_route_context, build_route_context,
ensure_responses_cache_control, ensure_responses_cache_control,
attach_previous_response_id,
remember_response_id,
inject_instructions_anthropic, inject_instructions_anthropic,
inject_instructions_cc, inject_instructions_cc,
inject_instructions_responses, inject_instructions_responses,
@ -249,6 +251,7 @@ def _handle_responses_backend(ctx: RouteContext, payload: dict[str, Any], turn:
payload['model'] = ctx.upstream_model payload['model'] = ctx.upstream_model
payload = inject_instructions_responses(payload, ctx.custom_instructions, ctx.instructions_position) payload = inject_instructions_responses(payload, ctx.custom_instructions, ctx.instructions_position)
payload = ensure_responses_cache_control(payload) payload = ensure_responses_cache_control(payload)
payload = attach_previous_response_id(payload)
url, headers = build_responses_target(ctx) url, headers = build_responses_target(ctx)
payload = apply_body_modifications(payload, ctx.body_modifications) payload = apply_body_modifications(payload, ctx.body_modifications)
headers = apply_header_modifications(headers, ctx.header_modifications) headers = apply_header_modifications(headers, ctx.header_modifications)
@ -276,6 +279,7 @@ def _handle_responses_non_stream(
response_data = resp.json() response_data = resp.json()
attach_upstream_response(turn, response_data) attach_upstream_response(turn, response_data)
remember_response_id(payload, response_data)
response_data['model'] = ctx.client_model response_data['model'] = ctx.client_model
return _finalize_responses_response( return _finalize_responses_response(
response_data, response_data,
@ -315,6 +319,10 @@ def _handle_responses_stream(
extracted_usage = _extract_responses_usage(event_data) extracted_usage = _extract_responses_usage(event_data)
if extracted_usage: if extracted_usage:
last_usage = extracted_usage last_usage = extracted_usage
if event_type == 'response.completed':
response_obj = event_data.get('response') if isinstance(event_data, dict) else None
if isinstance(response_obj, dict):
remember_response_id(payload, response_obj)
if event_count < 10: if event_count < 10:
_dbg( _dbg(
f'上游事件#{event_count} 类型={event_type} 数据=' f'上游事件#{event_count} 类型={event_type} 数据='