修复缓存命中bug
This commit is contained in:
parent
98f3ae24a0
commit
72223ef412
3 changed files with 178 additions and 0 deletions
|
|
@ -43,6 +43,8 @@ from routes.common import (
|
||||||
build_route_context,
|
build_route_context,
|
||||||
chat_error_chunk,
|
chat_error_chunk,
|
||||||
ensure_responses_cache_control,
|
ensure_responses_cache_control,
|
||||||
|
attach_previous_response_id,
|
||||||
|
remember_response_id,
|
||||||
inject_instructions_anthropic,
|
inject_instructions_anthropic,
|
||||||
inject_instructions_cc,
|
inject_instructions_cc,
|
||||||
inject_instructions_responses,
|
inject_instructions_responses,
|
||||||
|
|
@ -313,6 +315,7 @@ def _handle_responses_backend(ctx: RouteContext, payload: dict[str, Any], turn:
|
||||||
responses_payload['model'] = ctx.upstream_model
|
responses_payload['model'] = ctx.upstream_model
|
||||||
responses_payload = inject_instructions_responses(responses_payload, ctx.custom_instructions, ctx.instructions_position)
|
responses_payload = inject_instructions_responses(responses_payload, ctx.custom_instructions, ctx.instructions_position)
|
||||||
responses_payload = ensure_responses_cache_control(responses_payload)
|
responses_payload = ensure_responses_cache_control(responses_payload)
|
||||||
|
responses_payload = attach_previous_response_id(responses_payload)
|
||||||
_dbg(
|
_dbg(
|
||||||
'已转换为 Responses 请求:字段=' + str(list(responses_payload.keys()))
|
'已转换为 Responses 请求:字段=' + str(list(responses_payload.keys()))
|
||||||
+ f' 输入项数={len(responses_payload.get("input", []))}'
|
+ f' 输入项数={len(responses_payload.get("input", []))}'
|
||||||
|
|
@ -347,6 +350,7 @@ def _handle_responses_non_stream(
|
||||||
attach_upstream_response(turn, raw)
|
attach_upstream_response(turn, raw)
|
||||||
_dbg('上游原始响应=' + json.dumps(raw, ensure_ascii=False, default=str)[:1000])
|
_dbg('上游原始响应=' + json.dumps(raw, ensure_ascii=False, default=str)[:1000])
|
||||||
|
|
||||||
|
remember_response_id(payload, raw)
|
||||||
data = responses_to_cc_response(raw, ctx.client_model)
|
data = responses_to_cc_response(raw, ctx.client_model)
|
||||||
return _finalize_chat_response(ctx, data, turn=turn, debug_label='Responses 转回聊天补全后')
|
return _finalize_chat_response(ctx, data, turn=turn, debug_label='Responses 转回聊天补全后')
|
||||||
|
|
||||||
|
|
@ -385,6 +389,10 @@ def _handle_responses_stream(
|
||||||
'completion_tokens': extracted_usage.get('output_tokens', 0),
|
'completion_tokens': extracted_usage.get('output_tokens', 0),
|
||||||
'total_tokens': extracted_usage.get('total_tokens', 0),
|
'total_tokens': extracted_usage.get('total_tokens', 0),
|
||||||
}
|
}
|
||||||
|
if event_type == 'response.completed':
|
||||||
|
response_obj = event_data.get('response') if isinstance(event_data, dict) else None
|
||||||
|
if isinstance(response_obj, dict):
|
||||||
|
remember_response_id(payload, response_obj)
|
||||||
if event_count < 10:
|
if event_count < 10:
|
||||||
_dbg(
|
_dbg(
|
||||||
f'上游事件#{event_count} 类型={event_type} 数据='
|
f'上游事件#{event_count} 类型={event_type} 数据='
|
||||||
|
|
|
||||||
162
routes/common.py
162
routes/common.py
|
|
@ -7,8 +7,11 @@ SSE 消息拼装逻辑,避免 `chat.py` 和 `responses.py` 各自维护重复
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import settings
|
import settings
|
||||||
|
|
@ -16,6 +19,10 @@ from utils.http import build_anthropic_headers, build_gemini_headers, build_open
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Lock held around reads and writes of _RESPONSES_PREV_IDS in this module.
_RESPONSES_PREV_ID_LOCK = threading.Lock()
|
||||||
|
# Maximum age of a cached entry, compared against time.time() deltas
# (seconds), i.e. 24 hours.
_RESPONSES_PREV_ID_TTL = 86400
|
||||||
|
# Conversation key -> (response_id, time.time() value when it was stored).
_RESPONSES_PREV_IDS: dict[str, tuple[str, float]] = {}
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class RouteContext:
|
class RouteContext:
|
||||||
|
|
@ -212,6 +219,161 @@ def ensure_responses_cache_control(payload: dict[str, Any]) -> dict[str, Any]:
|
||||||
return payload
|
return payload
|
||||||
|
|
||||||
|
|
||||||
|
def attach_previous_response_id(payload: dict[str, Any]) -> dict[str, Any]:
    """Fill in the previous turn's response id on a multi-turn Responses request.

    Some upstreams only reuse the previous turn's server-side response chain
    and cache reliably on `/v1/responses` when the request carries
    `previous_response_id`. Cursor usually resends the whole history but does
    not set this field itself, so the proxy fills it in from a stable
    conversation key.

    The payload is modified in place and also returned. It is returned
    untouched when it is not a dict, already carries a truthy
    `previous_response_id`, yields no conversation key, or has no remembered
    response id.
    """
    if isinstance(payload, dict) and not payload.get('previous_response_id'):
        conversation_key = _responses_prev_id_key(payload)
        # Only consult the cache when a key could actually be derived.
        remembered_id = _get_previous_response_id(conversation_key) if conversation_key else ''
        if remembered_id:
            payload['previous_response_id'] = remembered_id
            logger.info('已为 Responses 请求补齐 previous_response_id')
    return payload
|
||||||
|
|
||||||
|
|
||||||
|
def remember_response_id(payload: dict[str, Any], response_data: dict[str, Any]) -> None:
    """Remember the latest upstream Responses response id for this conversation.

    Nothing is stored when either argument is not a dict, when
    `response_data['id']` is not a non-blank string, or when no conversation
    key can be derived from `payload`. Otherwise the stripped id is stored
    together with the current time, and expired entries are purged while the
    lock is still held.
    """
    if not (isinstance(payload, dict) and isinstance(response_data, dict)):
        return

    candidate = response_data.get('id')
    cleaned_id = candidate.strip() if isinstance(candidate, str) else ''
    if not cleaned_id:
        return

    conversation_key = _responses_prev_id_key(payload)
    if not conversation_key:
        return

    with _RESPONSES_PREV_ID_LOCK:
        _RESPONSES_PREV_IDS[conversation_key] = (cleaned_id, time.time())
        _cleanup_previous_response_ids_locked()
|
||||||
|
|
||||||
|
|
||||||
|
def _responses_prev_id_key(payload: dict[str, Any]) -> str:
    """Build a stable key from the "conversation root" of a Responses request.

    The full `input` is deliberately not hashed: a multi-turn conversation
    appends history on every turn, so hashing all of it would change the key
    each turn and the previous turn's `previous_response_id` could never be
    looked up again. Only `instructions` and the root seed of `input` are used.

    Args:
        payload: Responses request body. `instructions` and `input` are read.

    Returns:
        The first 24 hex characters of the SHA-256 of
        `instructions + '|' + seed`, or `''` when there is nothing to hash.
    """
    instructions = payload.get('instructions') or ''
    if not isinstance(instructions, str):
        # A non-string value would raise TypeError on concatenation below;
        # serialize it so the key stays deterministic instead.
        instructions = json.dumps(instructions, ensure_ascii=False, default=str)

    input_data = payload.get('input')
    if input_data is None:
        # Missing or null input carries no conversation root.
        seed_input = ''
    elif isinstance(input_data, str):
        seed_input = input_data
    elif isinstance(input_data, list):
        seed_input = _responses_root_seed_from_items(input_data)
        if seed_input == '[]':
            # No root message was found. Without this, the literal '[]' would
            # pass the emptiness guard below and every such request would
            # share one key.
            seed_input = ''
    else:
        seed_input = json.dumps(input_data, ensure_ascii=False, default=str)

    raw = instructions + '|' + seed_input
    if not raw.strip('|'):
        return ''
    return hashlib.sha256(raw.encode('utf-8')).hexdigest()[:24]
|
||||||
|
|
||||||
|
|
||||||
|
def _responses_root_seed_from_items(items: list[Any]) -> str:
    """Extract a stable conversation-root fragment from a Responses `input` list.

    The goal is not to reconstruct the conversation but to build a seed that
    stays constant across turns of the same conversation. It follows the
    project's conversation-seed idea: take the first user message and the
    first assistant message, or only the first user message when no assistant
    message exists yet.

    A bare string item counts as a user message. A dict item counts when its
    role is `user` or `assistant` and its `type` is `'message'` or falsy.
    Everything else is ignored.

    Returns:
        Compact JSON of the collected `{'role', 'content'}` dicts, user first.
        This is `'[]'` when nothing matched.
    """
    roots: dict[str, dict[str, str]] = {}

    for entry in items:
        if isinstance(entry, str):
            roots.setdefault('user', {'role': 'user', 'content': entry})
            continue
        if not isinstance(entry, dict):
            continue

        kind = entry.get('type', '')
        speaker = entry.get('role', '')
        if speaker in ('user', 'assistant') and (kind == 'message' or not kind):
            # Typed messages default to list content, untyped ones to a string.
            fallback = [] if kind == 'message' else ''
            candidate = {
                'role': speaker,
                'content': _responses_normalize_content(entry.get('content', fallback)),
            }
            # Only the first message of each role is kept.
            roots.setdefault(speaker, candidate)

        if 'user' in roots and 'assistant' in roots:
            break

    ordered = [roots[name] for name in ('user', 'assistant') if name in roots]
    return json.dumps(ordered, ensure_ascii=False, separators=(',', ':'))
|
||||||
|
|
||||||
|
|
||||||
|
def _responses_normalize_content(content: Any) -> str:
    """Collapse the various Responses `content` shapes into stable text.

    The aim is not a faithful rendering but to reduce the effect of structural
    differences on key computation. Only text-like content is extracted and
    other fields are ignored, so formatting details of the same message do not
    produce different `previous_response_id` keys.

    Args:
        content: A string, a list of parts (strings or dicts), `None`, or any
            other value.

    Returns:
        A stripped string. For lists this is the newline-joined text of string
        items and of dict parts whose `type` is `input_text`, `output_text`,
        `text` or `summary_text`. `None` gives `''`, and any other non-list
        value gives its stripped `str()` form.
    """
    if isinstance(content, str):
        return content.strip()
    if not isinstance(content, list):
        return str(content).strip() if content is not None else ''

    text_types = ('input_text', 'output_text', 'text', 'summary_text')
    texts: list[str] = []
    for part in content:
        if isinstance(part, str):
            texts.append(part)
        elif isinstance(part, dict) and part.get('type') in text_types:
            text = part.get('text', '')
            # A null or non-string `text` would make str.join raise TypeError,
            # so such parts are skipped.
            if isinstance(text, str):
                texts.append(text)
    return '\n'.join(texts).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _get_previous_response_id(key: str) -> str:
    """Look up the remembered response id for `key`, dropping it if expired.

    Returns the stored id while its age is below `_RESPONSES_PREV_ID_TTL`.
    Returns `''` when there is no entry, or when the entry has expired, in
    which case it is also removed from the cache.
    """
    with _RESPONSES_PREV_ID_LOCK:
        cached = _RESPONSES_PREV_IDS.get(key)
        if cached:
            remembered_id, stored_at = cached
            if (time.time() - stored_at) < _RESPONSES_PREV_ID_TTL:
                return remembered_id
            # Expired: evict during the read so the entry does not linger.
            _RESPONSES_PREV_IDS.pop(key, None)
        return ''
|
||||||
|
|
||||||
|
|
||||||
|
def _cleanup_previous_response_ids_locked() -> None:
    """Purge expired entries from the previous_response_id cache.

    The table only serves short-term multi-turn continuation. Once a
    conversation has been idle for long enough its entry is no longer needed,
    and dropping it keeps a long-running process from accumulating stale
    state.

    This function does not take `_RESPONSES_PREV_ID_LOCK` itself; the caller
    is expected to hold it already.
    """
    checked_at = time.time()
    # Iterate over a snapshot so entries can be deleted during the loop.
    for cache_key, (_, stored_at) in list(_RESPONSES_PREV_IDS.items()):
        if (checked_at - stored_at) >= _RESPONSES_PREV_ID_TTL:
            del _RESPONSES_PREV_IDS[cache_key]
|
||||||
|
|
||||||
|
|
||||||
def inject_instructions_anthropic(payload: dict[str, Any], instructions: str, position: str = 'prepend') -> dict[str, Any]:
|
def inject_instructions_anthropic(payload: dict[str, Any], instructions: str, position: str = 'prepend') -> dict[str, Any]:
|
||||||
"""向 Anthropic Messages 请求注入自定义指令(写入 system 字段)。
|
"""向 Anthropic Messages 请求注入自定义指令(写入 system 字段)。
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -28,6 +28,8 @@ from routes.common import (
|
||||||
build_responses_target,
|
build_responses_target,
|
||||||
build_route_context,
|
build_route_context,
|
||||||
ensure_responses_cache_control,
|
ensure_responses_cache_control,
|
||||||
|
attach_previous_response_id,
|
||||||
|
remember_response_id,
|
||||||
inject_instructions_anthropic,
|
inject_instructions_anthropic,
|
||||||
inject_instructions_cc,
|
inject_instructions_cc,
|
||||||
inject_instructions_responses,
|
inject_instructions_responses,
|
||||||
|
|
@ -249,6 +251,7 @@ def _handle_responses_backend(ctx: RouteContext, payload: dict[str, Any], turn:
|
||||||
payload['model'] = ctx.upstream_model
|
payload['model'] = ctx.upstream_model
|
||||||
payload = inject_instructions_responses(payload, ctx.custom_instructions, ctx.instructions_position)
|
payload = inject_instructions_responses(payload, ctx.custom_instructions, ctx.instructions_position)
|
||||||
payload = ensure_responses_cache_control(payload)
|
payload = ensure_responses_cache_control(payload)
|
||||||
|
payload = attach_previous_response_id(payload)
|
||||||
url, headers = build_responses_target(ctx)
|
url, headers = build_responses_target(ctx)
|
||||||
payload = apply_body_modifications(payload, ctx.body_modifications)
|
payload = apply_body_modifications(payload, ctx.body_modifications)
|
||||||
headers = apply_header_modifications(headers, ctx.header_modifications)
|
headers = apply_header_modifications(headers, ctx.header_modifications)
|
||||||
|
|
@ -276,6 +279,7 @@ def _handle_responses_non_stream(
|
||||||
|
|
||||||
response_data = resp.json()
|
response_data = resp.json()
|
||||||
attach_upstream_response(turn, response_data)
|
attach_upstream_response(turn, response_data)
|
||||||
|
remember_response_id(payload, response_data)
|
||||||
response_data['model'] = ctx.client_model
|
response_data['model'] = ctx.client_model
|
||||||
return _finalize_responses_response(
|
return _finalize_responses_response(
|
||||||
response_data,
|
response_data,
|
||||||
|
|
@ -315,6 +319,10 @@ def _handle_responses_stream(
|
||||||
extracted_usage = _extract_responses_usage(event_data)
|
extracted_usage = _extract_responses_usage(event_data)
|
||||||
if extracted_usage:
|
if extracted_usage:
|
||||||
last_usage = extracted_usage
|
last_usage = extracted_usage
|
||||||
|
if event_type == 'response.completed':
|
||||||
|
response_obj = event_data.get('response') if isinstance(event_data, dict) else None
|
||||||
|
if isinstance(response_obj, dict):
|
||||||
|
remember_response_id(payload, response_obj)
|
||||||
if event_count < 10:
|
if event_count < 10:
|
||||||
_dbg(
|
_dbg(
|
||||||
f'上游事件#{event_count} 类型={event_type} 数据='
|
f'上游事件#{event_count} 类型={event_type} 数据='
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue