diff --git a/README.md b/README.md index 6dcea52..b7b6d4c 100644 --- a/README.md +++ b/README.md @@ -159,18 +159,6 @@ api2cursor/ - `file_path` → `path` 字段映射 - `finish_reason` 修正 - -============================ -增加缓存,在api2cursor里面的body修改中加个你喜欢的随意字段: -{ - "prompt_cache_key": "GPT5-4-xxx-xxx" -} -openai 开 fast 模式 -{ - "service_tier": "priority" -} - - ## 许可证 [MIT](LICENSE) diff --git a/adapters/cc_anthropic_adapter.py b/adapters/cc_anthropic_adapter.py index 95aced6..512b3de 100644 --- a/adapters/cc_anthropic_adapter.py +++ b/adapters/cc_anthropic_adapter.py @@ -261,12 +261,6 @@ def _convert_request_message(message: Any) -> tuple[JsonDict | None, str | None] anthropic_role = 'assistant' if role == 'assistant' else 'user' anthropic_content = _convert_content(message) - if role == 'assistant' and message.get('reasoning_content'): - thinking_block = {'type': 'thinking', 'thinking': message['reasoning_content']} - blocks = _to_blocks(anthropic_content) - blocks.insert(0, thinking_block) - anthropic_content = blocks - if role == 'assistant' and 'tool_calls' in message: anthropic_content = _append_tool_use_blocks(anthropic_content, message.get('tool_calls', [])) @@ -469,8 +463,6 @@ def _convert_content_part(part: Any) -> JsonDict | None: return {'type': 'text', 'text': part.get('text', '')} if part_type == 'image_url': return _convert_image(part) - if part_type == 'image': - return part if part_type in ('tool_use', 'tool_result'): return part return None @@ -582,16 +574,38 @@ _EPHEMERAL = {'type': 'ephemeral'} def optimize_cache_control(request: JsonDict) -> None: - """为 Anthropic Messages 请求启用顶层自动 prompt caching。 + """自动设置最优的 Anthropic cache_control 断点。 - 2026 版 Claude API 已支持在请求顶层使用 `cache_control` 开启自动缓存, - 由上游自动把断点放到最后一个可缓存块并随多轮对话前移。相比手动在嵌套 - content blocks 上打断点,这种方式对 Anthropic 兼容中转站更稳定,也更接近 - `/v1/responses` 通过顶层字段启用缓存的思路。 + 算法移植自 CursorProxy 的 ensure_cache_control.go: + 1. 归一化所有消息 content 为数组格式 + 2. 清空所有已有 cache_control + 3. 注入结构锚点(tools 末尾 + system 末尾) + 4. 注入消息锚点(最后一个可缓存块 + 窗口边界) + 5. 总断点数不超过 4 个 """ _normalize_message_contents(request) _clear_all_cache_controls(request) - request['cache_control'] = dict(_EPHEMERAL) + + structural = _inject_structural_anchors(request) + remaining = _MAX_BREAKPOINTS - structural + if remaining <= 0: + return + + refs = _collect_cacheable_block_refs(request) + if not refs: + return + + desired = 1 if len(refs) < _BLOCK_WINDOW else 2 + anchors = min(desired, remaining) + + if anchors >= 1 and refs: + refs[-1]['cache_control'] = _EPHEMERAL + + if anchors >= 2 and len(refs) > 1: + target = len(refs) - _BLOCK_WINDOW + idx = _pick_window_anchor(refs, target) + if idx is not None and idx != len(refs) - 1: + refs[idx]['cache_control'] = _EPHEMERAL def _normalize_message_contents(request: JsonDict) -> None: @@ -606,7 +620,6 @@ def _normalize_message_contents(request: JsonDict) -> None: def _clear_all_cache_controls(request: JsonDict) -> None: """清空所有已有的 cache_control 字段。""" - request.pop('cache_control', None) for tool in request.get('tools', []): tool.pop('cache_control', None) diff --git a/adapters/responses_cc_adapter.py b/adapters/responses_cc_adapter.py index e6c864a..b793dbd 100644 --- a/adapters/responses_cc_adapter.py +++ b/adapters/responses_cc_adapter.py @@ -654,6 +654,10 @@ class ResponsesToCCStreamConverter: 'completion_tokens': self._usage.get('output_tokens', 0), 'total_tokens': self._usage.get('total_tokens', 0), } + if isinstance(self._usage.get('input_tokens_details'), dict): + chunk['usage']['prompt_tokens_details'] = dict(self._usage['input_tokens_details']) + if isinstance(self._usage.get('output_tokens_details'), dict): + chunk['usage']['completion_tokens_details'] = dict(self._usage['output_tokens_details']) return [chunk] def _make_chunk(self, delta: JsonDict, finish_reason: str | None = None) -> JsonDict: @@ -678,20 +682,44 @@ def _copy_request_options(payload: JsonDict, result: JsonDict) -> None: """将 Responses 请求中的通用选项复制到 CC 请求体。""" if 'tools' in payload: result['tools'] = _convert_tools(payload['tools']) - for key in ('temperature', 'top_p'): + for key in ( + 'temperature', + 'top_p', + 'tool_choice', + 'parallel_tool_calls', + 'truncation', + 'store', + 'metadata', + 'conversation', + 'previous_response_id', + 'prompt_cache_key', + 'service_tier', + 'user', + ): if key in payload: result[key] = payload[key] if 'max_output_tokens' in payload: result['max_tokens'] = payload['max_output_tokens'] - if 'tool_choice' in payload: - result['tool_choice'] = payload['tool_choice'] def _copy_responses_request_options(payload: JsonDict, result: JsonDict) -> None: """将聊天补全请求中的通用选项复制到原生 Responses 请求体。""" if 'tools' in payload: result['tools'] = _convert_cc_tools_to_responses(payload['tools']) - for key in ('temperature', 'top_p', 'tool_choice'): + for key in ( + 'temperature', + 'top_p', + 'tool_choice', + 'parallel_tool_calls', + 'truncation', + 'store', + 'metadata', + 'conversation', + 'previous_response_id', + 'prompt_cache_key', + 'service_tier', + 'user', + ): if key in payload: result[key] = payload[key] if 'max_tokens' in payload: @@ -703,11 +731,7 @@ def _append_responses_input_item( instructions: list[str], input_items: list[JsonDict], ) -> None: - """将单条 Chat Completions 消息追加为 Responses `input` 项。 - - 尽量使用 EasyInputMessage 格式({role, content})以减少 token 开销, - 提高上游 prompt caching 的前缀匹配命中率。 - """ + """将单条 Chat Completions 消息追加为 Responses `input` 项。""" if not isinstance(message, dict): return @@ -728,26 +752,21 @@ def _append_responses_input_item( }) return - text = _content_to_text(content) - has_tool_calls = bool(message.get('tool_calls')) + item: JsonDict = { + 'type': 'message', + 'role': role or 'user', + 'content': _content_to_responses_parts(content, role), + } + input_items.append(item) - if role == 'assistant' and has_tool_calls: - if text: - input_items.append({ - 'type': 'message', - 'role': 'assistant', - 'content': [{'type': 'output_text', 'text': text}], - }) + if role == 'assistant': for tool_call in message.get('tool_calls') or []: input_items.append(_build_responses_function_call_item(tool_call)) - else: - input_items.append({'role': role or 'user', 'content': text or ''}) def _convert_input_items(items: list[Any], messages: list[JsonDict]) -> None: """将 Responses `input` 数组重建为 Chat Completions `messages` 列表。""" index = 0 - pending_reasoning: str | None = None while index < len(items): item = items[index] @@ -763,35 +782,20 @@ def _convert_input_items(items: list[Any], messages: list[JsonDict]) -> None: item_type = item.get('type', '') role = item.get('role', '') - if item_type == 'reasoning': - pending_reasoning = _extract_reasoning_text(item) - index += 1 - continue - if role and not item_type: - msg: JsonDict = { + messages.append({ 'role': role, 'content': _normalize_simple_content(item.get('content', '')), - } - if role == 'assistant' and pending_reasoning: - msg['reasoning_content'] = pending_reasoning - pending_reasoning = None - messages.append(msg) + }) index += 1 continue if item_type == 'message': consumed = _append_message_item(items, start=index, messages=messages) - if item.get('role') == 'assistant' and pending_reasoning and messages: - messages[-1]['reasoning_content'] = pending_reasoning - pending_reasoning = None index += consumed continue if item_type == 'function_call': - if pending_reasoning and messages and messages[-1].get('role') == 'assistant': - messages[-1]['reasoning_content'] = pending_reasoning - pending_reasoning = None _append_function_call_item(item, messages) index += 1 continue @@ -938,11 +942,18 @@ def _make_function_call_output_item(tool_call: JsonDict) -> JsonDict: def _build_responses_usage(usage: JsonDict) -> JsonDict: """将 Chat Completions 的 usage 字段映射为 Responses usage 结构。""" - return { + result = { 'input_tokens': usage.get('prompt_tokens', 0), 'output_tokens': usage.get('completion_tokens', 0), 'total_tokens': usage.get('total_tokens', 0), } + prompt_details = usage.get('prompt_tokens_details') + if isinstance(prompt_details, dict): + result['input_tokens_details'] = dict(prompt_details) + completion_details = usage.get('completion_tokens_details') + if isinstance(completion_details, dict): + result['output_tokens_details'] = dict(completion_details) + return result def _collect_cc_parts_from_responses_output(output_items: Any) -> tuple[str, str, list[JsonDict]]: diff --git a/docker-compose.yml b/compose.yml similarity index 92% rename from docker-compose.yml rename to compose.yml index 3d657de..944b836 100644 --- a/docker-compose.yml +++ b/compose.yml @@ -1,7 +1,6 @@ services: api2cursor: build: . - container_name: api2cursor ports: - "${PROXY_PORT:-3029}:${PROXY_PORT:-3029}" env_file: diff --git a/routes/admin.py b/routes/admin.py index 612ee89..e8a9e77 100644 --- a/routes/admin.py +++ b/routes/admin.py @@ -13,7 +13,6 @@ from flask import Blueprint, request, jsonify, send_from_directory import settings from config import Config -from utils.request_history import request_history logger = logging.getLogger(__name__) @@ -203,15 +202,6 @@ def get_stats(): return jsonify(usage_tracker.get_stats()) -@bp.route('/api/admin/request-logs', methods=['GET']) -def get_request_logs(): - """返回最近 500 条请求日志。""" - err = _check_auth() - if err: - return err - return jsonify({'items': request_history.get_recent(500)}) - - # ─── 内部辅助 ───────────────────────────────────── diff --git a/routes/chat.py b/routes/chat.py index 9532ed0..8bea531 100644 --- a/routes/chat.py +++ b/routes/chat.py @@ -9,7 +9,6 @@ from __future__ import annotations import json import logging -from time import perf_counter from typing import Any import settings @@ -43,7 +42,9 @@ from routes.common import ( build_responses_target, build_route_context, chat_error_chunk, - ensure_prompt_cache_key, + ensure_responses_cache_control, + attach_previous_response_id, + remember_response_id, inject_instructions_anthropic, inject_instructions_cc, inject_instructions_responses, @@ -60,7 +61,6 @@ from utils.http import ( iter_responses_sse, sse_response, ) -from utils.request_history import request_history from utils.request_logger import ( append_client_event, append_upstream_event, @@ -115,7 +115,6 @@ def chat_completions(): client_model = payload.get('model', 'unknown') is_stream = payload.get('stream', False) ctx = build_route_context(client_model, is_stream) - request_started_at = perf_counter() turn = start_turn( route='chat', client_model=client_model, @@ -131,16 +130,15 @@ def chat_completions(): log_route_context('聊天补全', ctx, extra=f'消息数={message_count}') _log_messages(payload) - if ctx.backend != 'responses': - payload['messages'] = thinking_cache.inject(payload.get('messages', [])) + payload['messages'] = thinking_cache.inject(payload.get('messages', [])) if ctx.backend == 'openai': - return _handle_openai_backend(ctx, payload, turn, request_started_at) + return _handle_openai_backend(ctx, payload, turn) if ctx.backend == 'responses': - return _handle_responses_backend(ctx, payload, turn, request_started_at) + return _handle_responses_backend(ctx, payload, turn) if ctx.backend == 'gemini': - return _handle_gemini_backend(ctx, payload, turn, request_started_at) - return _handle_anthropic_backend(ctx, payload, turn, request_started_at) + return _handle_gemini_backend(ctx, payload, turn) + return _handle_anthropic_backend(ctx, payload, turn) def _normalize_chat_payload(payload: dict[str, Any]) -> tuple[dict[str, Any], int]: @@ -161,12 +159,7 @@ def _normalize_chat_payload(payload: dict[str, Any]) -> tuple[dict[str, Any], in return payload, message_count -def _handle_openai_backend( - ctx: RouteContext, - payload: dict[str, Any], - turn: dict[str, Any], - request_started_at: float, -): +def _handle_openai_backend(ctx: RouteContext, payload: dict[str, Any], turn: dict[str, Any]): """处理走 OpenAI 兼容后端的聊天补全请求。""" _dbg( '原始请求字段=' + str(list(payload.keys())) + ' ' @@ -190,8 +183,8 @@ def _handle_openai_backend( headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_openai_stream(ctx, payload, url, headers, turn, request_started_at) - return _handle_openai_non_stream(ctx, payload, url, headers, turn, request_started_at) + return _handle_openai_stream(ctx, payload, url, headers, turn) + return _handle_openai_non_stream(ctx, payload, url, headers, turn) def _handle_openai_non_stream( @@ -200,7 +193,6 @@ def _handle_openai_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any], - request_started_at: float, ): """处理 OpenAI 兼容后端的非流式返回。""" payload['stream'] = False @@ -216,14 +208,7 @@ def _handle_openai_non_stream( _dbg('上游原始响应=' + json.dumps(raw, ensure_ascii=False, default=str)[:1000]) data = fix_response(raw) - return _finalize_chat_response( - ctx, - data, - turn=turn, - debug_label='修复后响应', - request_started_at=request_started_at, - upstream_url=url, - ) + return _finalize_chat_response(ctx, data, turn=turn, debug_label='修复后响应') def _handle_openai_stream( @@ -232,7 +217,6 @@ def _handle_openai_stream( url: str, headers: dict[str, str], turn: dict[str, Any], - request_started_at: float, ): """处理 OpenAI 兼容后端的流式返回。""" payload['stream'] = True @@ -275,18 +259,7 @@ def _handle_openai_stream( 'chunk_count': len(client_chunks), 'usage': last_usage, }) - duration_ms = int((perf_counter() - request_started_at) * 1000) - request_history.record( - route='chat', - client_model=ctx.client_model, - actual_model=ctx.upstream_model, - backend=ctx.backend, - upstream_url=url, - usage=last_usage, - duration_ms=duration_ms, - started_at=(turn or {}).get('started_at'), - ) - finalize_turn(turn, usage=last_usage, duration_ms=duration_ms) + finalize_turn(turn, usage=last_usage) return append_upstream_event(turn, {'type': 'openai_chunk', 'data': chunk}) @@ -327,28 +300,12 @@ def _handle_openai_stream( 'chunk_count': len(client_chunks), 'usage': last_usage, }) - duration_ms = int((perf_counter() - request_started_at) * 1000) - request_history.record( - route='chat', - client_model=ctx.client_model, - actual_model=ctx.upstream_model, - backend=ctx.backend, - upstream_url=url, - usage=last_usage, - duration_ms=duration_ms, - started_at=(turn or {}).get('started_at'), - ) - finalize_turn(turn, usage=last_usage, duration_ms=duration_ms) + finalize_turn(turn, usage=last_usage) return sse_response(generate()) -def _handle_responses_backend( - ctx: RouteContext, - payload: dict[str, Any], - turn: dict[str, Any] | None, - request_started_at: float, -): +def _handle_responses_backend(ctx: RouteContext, payload: dict[str, Any], turn: dict[str, Any] | None): """处理走原生 Responses 后端的聊天补全请求。 当上游只支持 `/v1/responses` 时,需要先把聊天补全请求转换为 Responses 请求, @@ -357,7 +314,8 @@ def _handle_responses_backend( responses_payload = cc_to_responses_request(payload) responses_payload['model'] = ctx.upstream_model responses_payload = inject_instructions_responses(responses_payload, ctx.custom_instructions, ctx.instructions_position) - responses_payload = ensure_prompt_cache_key(responses_payload) + responses_payload = ensure_responses_cache_control(responses_payload) + responses_payload = attach_previous_response_id(responses_payload) _dbg( '已转换为 Responses 请求:字段=' + str(list(responses_payload.keys())) + f' 输入项数={len(responses_payload.get("input", []))}' @@ -368,8 +326,8 @@ def _handle_responses_backend( headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_responses_stream(ctx, responses_payload, url, headers, turn, request_started_at) - return _handle_responses_non_stream(ctx, responses_payload, url, headers, turn, request_started_at) + return _handle_responses_stream(ctx, responses_payload, url, headers, turn) + return _handle_responses_non_stream(ctx, responses_payload, url, headers, turn) def _handle_responses_non_stream( @@ -378,7 +336,6 @@ def _handle_responses_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, - request_started_at: float, ): """处理原生 Responses 后端的非流式返回。""" payload['stream'] = False @@ -393,15 +350,9 @@ def _handle_responses_non_stream( attach_upstream_response(turn, raw) _dbg('上游原始响应=' + json.dumps(raw, ensure_ascii=False, default=str)[:1000]) + remember_response_id(payload, raw) data = responses_to_cc_response(raw, ctx.client_model) - return _finalize_chat_response( - ctx, - data, - turn=turn, - debug_label='Responses 转回聊天补全后', - request_started_at=request_started_at, - upstream_url=url, - ) + return _finalize_chat_response(ctx, data, turn=turn, debug_label='Responses 转回聊天补全后') def _handle_responses_stream( @@ -410,7 +361,6 @@ def _handle_responses_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, - request_started_at: float, ): """处理原生 Responses 后端的流式返回。""" payload['stream'] = True @@ -439,6 +389,10 @@ def _handle_responses_stream( 'completion_tokens': extracted_usage.get('output_tokens', 0), 'total_tokens': extracted_usage.get('total_tokens', 0), } + if event_type == 'response.completed': + response_obj = event_data.get('response') if isinstance(event_data, dict) else None + if isinstance(response_obj, dict): + remember_response_id(payload, response_obj) if event_count < 10: _dbg( f'上游事件#{event_count} 类型={event_type} 数据=' @@ -474,28 +428,12 @@ def _handle_responses_stream( 'chunk_count': len(client_chunks), 'usage': last_usage, }) - duration_ms = int((perf_counter() - request_started_at) * 1000) - request_history.record( - route='chat', - client_model=ctx.client_model, - actual_model=ctx.upstream_model, - backend=ctx.backend, - upstream_url=url, - usage=last_usage, - duration_ms=duration_ms, - started_at=(turn or {}).get('started_at'), - ) - finalize_turn(turn, usage=last_usage, duration_ms=duration_ms) + finalize_turn(turn, usage=last_usage) return sse_response(generate()) -def _handle_gemini_backend( - ctx: RouteContext, - payload: dict[str, Any], - turn: dict[str, Any] | None, - request_started_at: float, -): +def _handle_gemini_backend(ctx: RouteContext, payload: dict[str, Any], turn: dict[str, Any] | None): """处理走 Gemini Contents 后端的聊天补全请求。""" payload = inject_instructions_cc(payload, ctx.custom_instructions, ctx.instructions_position) gemini_payload = cc_to_gemini_request(payload) @@ -509,8 +447,8 @@ def _handle_gemini_backend( headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_gemini_stream(ctx, gemini_payload, url, headers, turn, request_started_at) - return _handle_gemini_non_stream(ctx, gemini_payload, url, headers, turn, request_started_at) + return _handle_gemini_stream(ctx, gemini_payload, url, headers, turn) + return _handle_gemini_non_stream(ctx, gemini_payload, url, headers, turn) def _handle_gemini_non_stream( @@ -519,7 +457,6 @@ def _handle_gemini_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, - request_started_at: float, ): """处理 Gemini 后端的非流式返回。""" attach_upstream_request(turn, payload, headers) @@ -534,14 +471,7 @@ def _handle_gemini_non_stream( _dbg('上游原始响应=' + json.dumps(raw, ensure_ascii=False, default=str)[:1000]) data = gemini_to_cc_response(raw) - return _finalize_chat_response( - ctx, - data, - turn=turn, - debug_label='Gemini 转回聊天补全后', - request_started_at=request_started_at, - upstream_url=url, - ) + return _finalize_chat_response(ctx, data, turn=turn, debug_label='Gemini 转回聊天补全后') def _handle_gemini_stream( @@ -550,7 +480,6 @@ def _handle_gemini_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, - request_started_at: float, ): """处理 Gemini 后端的流式返回。""" converter = GeminiStreamConverter() @@ -613,28 +542,12 @@ def _handle_gemini_stream( 'chunk_count': len(client_chunks), 'usage': last_usage, }) - duration_ms = int((perf_counter() - request_started_at) * 1000) - request_history.record( - route='chat', - client_model=ctx.client_model, - actual_model=ctx.upstream_model, - backend=ctx.backend, - upstream_url=url, - usage=last_usage, - duration_ms=duration_ms, - started_at=(turn or {}).get('started_at'), - ) - finalize_turn(turn, usage=last_usage, duration_ms=duration_ms) + finalize_turn(turn, usage=last_usage) return sse_response(generate()) -def _handle_anthropic_backend( - ctx: RouteContext, - payload: dict[str, Any], - turn: dict[str, Any] | None, - request_started_at: float, -): +def _handle_anthropic_backend(ctx: RouteContext, payload: dict[str, Any], turn: dict[str, Any] | None): """处理走 Anthropic Messages 后端的聊天补全请求。""" payload['model'] = ctx.upstream_model anthropic_payload = cc_to_messages_request(payload) @@ -649,8 +562,8 @@ def _handle_anthropic_backend( headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_anthropic_stream(ctx, anthropic_payload, url, headers, turn, request_started_at) - return _handle_anthropic_non_stream(ctx, anthropic_payload, url, headers, turn, request_started_at) + return _handle_anthropic_stream(ctx, anthropic_payload, url, headers, turn) + return _handle_anthropic_non_stream(ctx, anthropic_payload, url, headers, turn) def _handle_anthropic_non_stream( @@ -659,7 +572,6 @@ def _handle_anthropic_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, - request_started_at: float, ): """处理 Anthropic 后端的非流式返回。""" payload['stream'] = False @@ -675,14 +587,7 @@ def _handle_anthropic_non_stream( _dbg('上游原始响应=' + json.dumps(raw, ensure_ascii=False, default=str)[:1000]) data = messages_to_cc_response(raw) - return _finalize_chat_response( - ctx, - data, - turn=turn, - debug_label='Messages 转回聊天补全后', - request_started_at=request_started_at, - upstream_url=url, - ) + return _finalize_chat_response(ctx, data, turn=turn, debug_label='Messages 转回聊天补全后') def _handle_anthropic_stream( @@ -691,7 +596,6 @@ def _handle_anthropic_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, - request_started_at: float, ): """处理 Anthropic 后端的流式返回。 @@ -776,18 +680,7 @@ def _handle_anthropic_stream( 'chunk_count': len(client_chunks), 'usage': last_usage, }) - duration_ms = int((perf_counter() - request_started_at) * 1000) - request_history.record( - route='chat', - client_model=ctx.client_model, - actual_model=ctx.upstream_model, - backend=ctx.backend, - upstream_url=url, - usage=last_usage, - duration_ms=duration_ms, - started_at=(turn or {}).get('started_at'), - ) - finalize_turn(turn, usage=last_usage, duration_ms=duration_ms) + finalize_turn(turn, usage=last_usage) return sse_response(generate()) @@ -798,8 +691,6 @@ def _finalize_chat_response( *, turn: dict[str, Any] | None, debug_label: str, - request_started_at: float, - upstream_url: str, ): """统一收尾非流式聊天补全响应。 @@ -812,21 +703,9 @@ def _finalize_chat_response( _dbg(debug_label + '=' + json.dumps(data, ensure_ascii=False, default=str)[:1000]) log_usage('聊天补全', data.get('usage', {}), input_key='prompt_tokens', output_key='completion_tokens') - usage = data.get('usage') - duration_ms = int((perf_counter() - request_started_at) * 1000) - usage_tracker.record(ctx.client_model, usage) - request_history.record( - route='chat', - client_model=ctx.client_model, - actual_model=ctx.upstream_model, - backend=ctx.backend, - upstream_url=upstream_url, - usage=usage, - duration_ms=duration_ms, - started_at=(turn or {}).get('started_at'), - ) + usage_tracker.record(ctx.client_model, data.get('usage')) attach_client_response(turn, data) - finalize_turn(turn, usage=usage, duration_ms=duration_ms) + finalize_turn(turn, usage=data.get('usage')) for choice in data.get('choices', []): msg = choice.get('message', {}) diff --git a/routes/common.py b/routes/common.py index 9f57233..f008b96 100644 --- a/routes/common.py +++ b/routes/common.py @@ -10,6 +10,8 @@ from dataclasses import dataclass import hashlib import json import logging +import threading +import time from typing import Any import settings @@ -17,6 +19,10 @@ from utils.http import build_anthropic_headers, build_gemini_headers, build_open logger = logging.getLogger(__name__) +_RESPONSES_PREV_ID_LOCK = threading.Lock() +_RESPONSES_PREV_ID_TTL = 86400 +_RESPONSES_PREV_IDS: dict[str, tuple[str, float]] = {} + @dataclass(frozen=True) class RouteContext: @@ -196,6 +202,178 @@ def inject_instructions_responses(payload: dict[str, Any], instructions: str, po return payload +def ensure_responses_cache_control(payload: dict[str, Any]) -> dict[str, Any]: + """为 Responses 请求补齐自动 prompt caching 开关。 + + 一些支持 `/v1/responses` 的上游会参考顶层 `cache_control` 来自动放置缓存断点。 + Cursor 侧通常不会主动携带这个字段,因此这里在缺失时补一个保守的默认值, + 同时允许调用方通过 body_modifications 或显式字段自行覆盖/关闭。 + """ + if not isinstance(payload, dict): + return payload + cache_control = payload.get('cache_control') + if isinstance(cache_control, dict) and cache_control.get('type'): + return payload + payload['cache_control'] = {'type': 'ephemeral'} + logger.info('已为 Responses 请求自动启用 cache_control=ephemeral') + return payload + + +def attach_previous_response_id(payload: dict[str, Any]) -> dict[str, Any]: + """为多轮 Responses 请求补齐上一轮 response_id。 + + 某些上游在 `/v1/responses` 多轮场景下,只有沿用 `previous_response_id` 才能稳定复用 + 上一轮的服务端响应链与缓存。Cursor 通常会回传完整历史,但不会主动带这个字段, + 因此代理需要基于稳定对话键做一次轻量补齐。 + """ + if not isinstance(payload, dict) or payload.get('previous_response_id'): + return payload + key = _responses_prev_id_key(payload) + if not key: + return payload + previous_response_id = _get_previous_response_id(key) + if not previous_response_id: + return payload + payload['previous_response_id'] = previous_response_id + logger.info('已为 Responses 请求补齐 previous_response_id') + return payload + + +def remember_response_id(payload: dict[str, Any], response_data: dict[str, Any]) -> None: + """记住当前对话最近一次上游 Responses response_id。""" + if not isinstance(payload, dict) or not isinstance(response_data, dict): + return + response_id = response_data.get('id') + if not isinstance(response_id, str) or not response_id.strip(): + return + key = _responses_prev_id_key(payload) + if not key: + return + with _RESPONSES_PREV_ID_LOCK: + _RESPONSES_PREV_IDS[key] = (response_id.strip(), time.time()) + _cleanup_previous_response_ids_locked() + + +def _responses_prev_id_key(payload: dict[str, Any]) -> str: + """基于 Responses 请求的“对话根信息”生成稳定键。 + + 这里故意不直接使用完整 `input` 作为键,因为多轮对话每轮都会追加历史; + 如果把整段历史都纳入哈希,键会在每一轮变化,导致无法稳定取回上一轮的 + `previous_response_id`。当前策略只取 instructions 与首轮 user/assistant 根消息。 + """ + instructions = payload.get('instructions') or '' + input_data = payload.get('input', []) + if isinstance(input_data, str): + seed_input = input_data + elif isinstance(input_data, list): + seed_input = _responses_root_seed_from_items(input_data) + else: + seed_input = json.dumps(input_data, ensure_ascii=False, default=str) + raw = instructions + '|' + seed_input + if not raw.strip('|'): + return '' + return hashlib.sha256(raw.encode('utf-8')).hexdigest()[:24] + + +def _responses_root_seed_from_items(items: list[Any]) -> str: + """从 Responses `input` 中提取足够稳定的对话根片段。 + + 目标不是完整还原会话,而是构造一个在同一段对话内尽量恒定、跨轮次可复用的 + seed。这里沿用项目里 conversation seed 的思路:优先取第一条 user 与第一条 + assistant;如果 assistant 还不存在,则只用第一条 user。 + """ + first_user = None + first_assistant = None + for item in items: + if isinstance(item, str): + if first_user is None: + first_user = {'role': 'user', 'content': item} + continue + if not isinstance(item, dict): + continue + item_type = item.get('type', '') + role = item.get('role', '') + if item_type == 'message' and role in ('user', 'assistant'): + normalized = { + 'role': role, + 'content': _responses_normalize_content(item.get('content', [])), + } + if role == 'user' and first_user is None: + first_user = normalized + elif role == 'assistant' and first_assistant is None: + first_assistant = normalized + elif role in ('user', 'assistant') and not item_type: + normalized = { + 'role': role, + 'content': _responses_normalize_content(item.get('content', '')), + } + if role == 'user' and first_user is None: + first_user = normalized + elif role == 'assistant' and first_assistant is None: + first_assistant = normalized + if first_user is not None and first_assistant is not None: + break + parts = [] + if first_user is not None: + parts.append(first_user) + if first_assistant is not None: + parts.append(first_assistant) + return json.dumps(parts, ensure_ascii=False, separators=(',', ':')) + + +def _responses_normalize_content(content: Any) -> str: + """把 Responses 各种 content 形态折叠成稳定文本。 + + 这里的目标不是保真展示,而是降低结构差异对 key 计算的影响;只抽取会影响 + 会话根语义的文本型内容,忽略无关字段,避免同一轮请求因格式细节不同而得到 + 不同的 previous_response_id 键。 + """ + if isinstance(content, str): + return content.strip() + if not isinstance(content, list): + return str(content).strip() if content is not None else '' + texts: list[str] = [] + for part in content: + if isinstance(part, str): + texts.append(part) + continue + if not isinstance(part, dict): + continue + if part.get('type') in ('input_text', 'output_text', 'text'): + texts.append(part.get('text', '')) + elif part.get('type') == 'summary_text': + texts.append(part.get('text', '')) + return '\n'.join(texts).strip() + + +def _get_previous_response_id(key: str) -> str: + """按稳定键读取上一轮 response_id,并在过期时顺手清理。""" + with _RESPONSES_PREV_ID_LOCK: + entry = _RESPONSES_PREV_IDS.get(key) + if not entry: + return '' + response_id, ts = entry + if (time.time() - ts) >= _RESPONSES_PREV_ID_TTL: + _RESPONSES_PREV_IDS.pop(key, None) + return '' + return response_id + + +def _cleanup_previous_response_ids_locked() -> None: + """清理过期的 previous_response_id 缓存项。 + + 这张表只用于短期多轮续接;一旦对话长时间不活跃,就不再需要继续保留, + 以免常驻进程运行过久后累计过多失效状态。 + """ + now = time.time() + expired = [ + key for key, (_, ts) in _RESPONSES_PREV_IDS.items() + if (now - ts) >= _RESPONSES_PREV_ID_TTL + ] + for key in expired: + _RESPONSES_PREV_IDS.pop(key, None) + + def inject_instructions_anthropic(payload: dict[str, Any], instructions: str, position: str = 'prepend') -> dict[str, Any]: """向 Anthropic Messages 请求注入自定义指令(写入 system 字段)。 @@ -219,23 +397,6 @@ def inject_instructions_anthropic(payload: dict[str, Any], instructions: str, po # ─── Body / Header 修改 ────────────────────────── -def ensure_prompt_cache_key(payload: dict[str, Any]) -> dict[str, Any]: - """确保 Responses 请求携带 prompt_cache_key 以启用上游提示缓存。 - - 上游(如 sub2api)对原生 /v1/responses 请求不会自动生成 prompt_cache_key, - 导致提示缓存无法命中。这里根据模型名 + instructions 生成稳定的 cache key, - 使得相同模型和系统提示的对话可以共享缓存前缀。 - """ - if payload.get('prompt_cache_key'): - return payload - - model = payload.get('model', '') - instructions = payload.get('instructions', '') - seed = f'{model}|{instructions}' - payload['prompt_cache_key'] = hashlib.sha256(seed.encode()).hexdigest()[:32] - return payload - - def apply_body_modifications(payload: dict[str, Any], modifications: dict[str, Any]) -> dict[str, Any]: """对转发请求体应用字段级修改。 diff --git a/routes/messages.py b/routes/messages.py index a320081..0d9faa5 100644 --- a/routes/messages.py +++ b/routes/messages.py @@ -7,7 +7,6 @@ Anthropic Messages API 透传。当 Cursor 直接发送 Anthropic 格式请求 import json import logging -from time import perf_counter import requests as req_lib from flask import Blueprint, request, jsonify @@ -16,7 +15,6 @@ import settings from config import Config from routes.common import apply_body_modifications, apply_header_modifications, inject_instructions_anthropic from utils.http import build_anthropic_headers, forward_request, sse_response -from utils.request_history import request_history from utils.request_logger import ( append_client_event, append_upstream_event, @@ -42,7 +40,6 @@ def messages_passthrough(): model = payload.get('model', 'unknown') is_stream = payload.get('stream', False) - request_started_at = perf_counter() logger.info(f'[透传] model={model} 流式={is_stream}') mapping = settings.resolve_model(model) @@ -81,18 +78,7 @@ def messages_passthrough(): attach_upstream_response(turn, data) _inject_thinking(data) attach_client_response(turn, data) - duration_ms = int((perf_counter() - request_started_at) * 1000) - request_history.record( - route='messages', - client_model=model, - actual_model=model, - backend='anthropic', - upstream_url=url, - usage=data.get('usage'), - duration_ms=duration_ms, - started_at=(turn or {}).get('started_at'), - ) - finalize_turn(turn, usage=data.get('usage'), duration_ms=duration_ms) + finalize_turn(turn) return jsonify(data) def generate(): @@ -122,18 +108,7 @@ def messages_passthrough(): 'type': 'messages.stream.summary', 'event_count': len(client_events), }) - duration_ms = int((perf_counter() - request_started_at) * 1000) - request_history.record( - route='messages', - client_model=model, - actual_model=model, - backend='anthropic', - upstream_url=url, - usage=None, - duration_ms=duration_ms, - started_at=(turn or {}).get('started_at'), - ) - finalize_turn(turn, duration_ms=duration_ms) + finalize_turn(turn) except req_lib.RequestException as e: logger.error(f'请求上游失败: {e}') attach_error(turn, {'stage': 'request_exception', 'message': str(e)}) diff --git a/routes/responses.py b/routes/responses.py index 271c30f..dd32d5c 100644 --- a/routes/responses.py +++ b/routes/responses.py @@ -8,7 +8,6 @@ from __future__ import annotations import json import logging -from time import perf_counter from typing import Any import settings @@ -28,7 +27,9 @@ from routes.common import ( build_openai_target, build_responses_target, build_route_context, - ensure_prompt_cache_key, + ensure_responses_cache_control, + attach_previous_response_id, + remember_response_id, inject_instructions_anthropic, inject_instructions_cc, inject_instructions_responses, @@ -45,7 +46,6 @@ from utils.http import ( iter_responses_sse, sse_response, ) -from utils.request_history import request_history from utils.request_logger import ( append_client_event, append_upstream_event, @@ -80,7 +80,6 @@ def responses_endpoint(): client_model = payload.get('model', 'unknown') is_stream = payload.get('stream', False) - request_started_at = perf_counter() ctx = build_route_context(client_model, is_stream) turn = start_turn( route='responses', @@ -97,12 +96,12 @@ def responses_endpoint(): cc_payload = _build_cc_payload(payload, ctx) if ctx.backend == 'openai': - return _handle_openai_backend(ctx, cc_payload, turn, request_started_at) + return _handle_openai_backend(ctx, cc_payload, turn) if ctx.backend == 'responses': - return _handle_responses_backend(ctx, payload, turn, request_started_at) + return _handle_responses_backend(ctx, payload, turn) if ctx.backend == 'gemini': - return _handle_gemini_backend(ctx, cc_payload, turn, request_started_at) - return _handle_anthropic_backend(ctx, cc_payload, turn, request_started_at) + return _handle_gemini_backend(ctx, cc_payload, turn) + return _handle_anthropic_backend(ctx, cc_payload, turn) def _build_cc_payload(payload: dict[str, Any], ctx: RouteContext) -> dict[str, Any]: @@ -122,12 +121,7 @@ def _build_cc_payload(payload: dict[str, Any], ctx: RouteContext) -> dict[str, A return cc_payload -def _handle_openai_backend( - ctx: RouteContext, - cc_payload: dict[str, Any], - turn: dict[str, Any], - request_started_at: float, -): +def _handle_openai_backend(ctx: RouteContext, cc_payload: dict[str, Any], turn: dict[str, Any]): """处理走 OpenAI 兼容后端的 Responses 请求。""" cc_payload = normalize_request(cc_payload) _dbg( @@ -140,8 +134,8 @@ def _handle_openai_backend( headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_openai_stream(ctx, cc_payload, url, headers, turn, request_started_at) - return _handle_openai_non_stream(ctx, cc_payload, url, headers, turn, request_started_at) + return _handle_openai_stream(ctx, cc_payload, url, headers, turn) + return _handle_openai_non_stream(ctx, cc_payload, url, headers, turn) def _handle_openai_non_stream( @@ -150,7 +144,6 @@ def _handle_openai_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any], - request_started_at: float, ): """处理 OpenAI 兼容后端的非流式 Responses 返回。""" cc_payload['stream'] = False @@ -172,9 +165,6 @@ def _handle_openai_non_stream( client_model=ctx.client_model, turn=turn, debug_label='转换为 Responses 后', - ctx=ctx, - request_started_at=request_started_at, - upstream_url=url, ) @@ -184,7 +174,6 @@ def _handle_openai_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, - request_started_at: float, ): """处理 OpenAI 兼容后端的流式 Responses 返回。""" cc_payload['stream'] = True @@ -225,18 +214,7 @@ def _handle_openai_stream( 'model': ctx.client_model, 'event_count': len(client_events), }) - duration_ms = int((perf_counter() - request_started_at) * 1000) - request_history.record( - route='responses', - client_model=ctx.client_model, - actual_model=ctx.upstream_model, - backend=ctx.backend, - upstream_url=url, - usage=None, - duration_ms=duration_ms, - started_at=(turn or {}).get('started_at'), - ) - finalize_turn(turn, duration_ms=duration_ms) + finalize_turn(turn) return append_upstream_event(turn, {'type': 'openai_chunk', 'data': chunk}) @@ -263,12 +241,7 @@ def _handle_openai_stream( return sse_response(generate()) -def _handle_responses_backend( - ctx: RouteContext, - payload: dict[str, Any], - turn: dict[str, Any] | None, - request_started_at: float, -): +def _handle_responses_backend(ctx: RouteContext, payload: dict[str, Any], turn: dict[str, Any] | None): """处理走原生 Responses 后端的请求。 当中转站本身就只支持 `/v1/responses` 时,不需要再绕到聊天补全中间协议, @@ -277,14 +250,15 @@ def _handle_responses_backend( payload = dict(payload) payload['model'] = ctx.upstream_model payload = inject_instructions_responses(payload, ctx.custom_instructions, ctx.instructions_position) - payload = ensure_prompt_cache_key(payload) + payload = ensure_responses_cache_control(payload) + payload = attach_previous_response_id(payload) url, headers = build_responses_target(ctx) payload = apply_body_modifications(payload, ctx.body_modifications) headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_responses_stream(ctx, payload, url, headers, turn, request_started_at) - return _handle_responses_non_stream(ctx, payload, url, headers, turn, request_started_at) + return _handle_responses_stream(ctx, payload, url, headers, turn) + return _handle_responses_non_stream(ctx, payload, url, headers, turn) def _handle_responses_non_stream( @@ -293,7 +267,6 @@ def _handle_responses_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, - request_started_at: float, ): """处理原生 Responses 后端的非流式返回。""" payload['stream'] = False @@ -306,15 +279,13 @@ def _handle_responses_non_stream( response_data = resp.json() attach_upstream_response(turn, response_data) + remember_response_id(payload, response_data) response_data['model'] = ctx.client_model return _finalize_responses_response( response_data, client_model=ctx.client_model, turn=turn, debug_label='原生 Responses 返回后', - ctx=ctx, - request_started_at=request_started_at, - upstream_url=url, ) @@ -324,7 +295,6 @@ def _handle_responses_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, - request_started_at: float, ): """处理原生 Responses 后端的流式返回。""" payload['stream'] = True @@ -349,6 +319,10 @@ def _handle_responses_stream( extracted_usage = _extract_responses_usage(event_data) if extracted_usage: last_usage = extracted_usage + if event_type == 'response.completed': + response_obj = event_data.get('response') if isinstance(event_data, dict) else None + if isinstance(response_obj, dict): + remember_response_id(payload, response_obj) if event_count < 10: _dbg( f'上游事件#{event_count} 类型={event_type} 数据=' @@ -379,18 +353,7 @@ def _handle_responses_stream( 'event_count': len(client_events), 'usage': last_usage, }) - duration_ms = int((perf_counter() - request_started_at) * 1000) - request_history.record( - route='responses', - client_model=ctx.client_model, - actual_model=ctx.upstream_model, - backend=ctx.backend, - upstream_url=url, - usage=last_usage, - duration_ms=duration_ms, - started_at=(turn or {}).get('started_at'), - ) - finalize_turn(turn, usage=last_usage, duration_ms=duration_ms) + finalize_turn(turn, usage=last_usage) return sse_response(generate()) @@ -414,12 +377,7 @@ def _extract_responses_usage(event_data: dict[str, Any]) -> dict[str, Any] | Non return None -def _handle_gemini_backend( - ctx: RouteContext, - cc_payload: dict[str, Any], - turn: dict[str, Any] | None, - request_started_at: float, -): +def _handle_gemini_backend(ctx: RouteContext, cc_payload: dict[str, Any], turn: dict[str, Any] | None): """处理走 Gemini Contents 后端的 Responses 请求。""" gemini_payload = cc_to_gemini_request(cc_payload) _dbg( @@ -432,8 +390,8 @@ def _handle_gemini_backend( headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_gemini_stream(ctx, gemini_payload, url, headers, turn, request_started_at) - return _handle_gemini_non_stream(ctx, gemini_payload, url, headers, turn, request_started_at) + return _handle_gemini_stream(ctx, gemini_payload, url, headers, turn) + return _handle_gemini_non_stream(ctx, gemini_payload, url, headers, turn) def _handle_gemini_non_stream( @@ -442,7 +400,6 @@ def _handle_gemini_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, - request_started_at: float, ): """处理 Gemini 后端的非流式 Responses 返回。""" attach_upstream_request(turn, payload, headers) @@ -463,9 +420,6 @@ def _handle_gemini_non_stream( client_model=ctx.client_model, turn=turn, debug_label='Gemini 转回 Responses 后', - ctx=ctx, - request_started_at=request_started_at, - upstream_url=url, ) @@ -475,7 +429,6 @@ def _handle_gemini_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, - request_started_at: float, ): """处理 Gemini 后端的流式 Responses 返回。""" converter = ResponsesStreamConverter(model=ctx.client_model) @@ -542,28 +495,12 @@ def _handle_gemini_stream( 'event_count': len(client_events), 'usage': last_usage, }) - duration_ms = int((perf_counter() - request_started_at) * 1000) - request_history.record( - route='responses', - client_model=ctx.client_model, - actual_model=ctx.upstream_model, - backend=ctx.backend, - upstream_url=url, - usage=last_usage, - duration_ms=duration_ms, - started_at=(turn or {}).get('started_at'), - ) - finalize_turn(turn, usage=last_usage, duration_ms=duration_ms) + finalize_turn(turn, usage=last_usage) return sse_response(generate()) -def _handle_anthropic_backend( - ctx: RouteContext, - cc_payload: dict[str, Any], - turn: dict[str, Any] | None, - request_started_at: float, -): +def _handle_anthropic_backend(ctx: RouteContext, cc_payload: dict[str, Any], turn: dict[str, Any] | None): """处理走 Anthropic 后端的 Responses 请求。""" anthropic_payload = cc_to_messages_request(cc_payload) _dbg( @@ -576,8 +513,8 @@ def _handle_anthropic_backend( headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_anthropic_stream(ctx, anthropic_payload, url, headers, turn, request_started_at) - return _handle_anthropic_non_stream(ctx, anthropic_payload, url, headers, turn, request_started_at) + return _handle_anthropic_stream(ctx, anthropic_payload, url, headers, turn) + return _handle_anthropic_non_stream(ctx, anthropic_payload, url, headers, turn) def _handle_anthropic_non_stream( @@ -586,7 +523,6 @@ def _handle_anthropic_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, - request_started_at: float, ): """处理 Anthropic 后端的非流式 Responses 返回。""" anthropic_payload['stream'] = False @@ -608,9 +544,6 @@ def _handle_anthropic_non_stream( client_model=ctx.client_model, turn=turn, debug_label='Messages 转回 Responses 后', - ctx=ctx, - request_started_at=request_started_at, - upstream_url=url, ) @@ -620,7 +553,6 @@ def _handle_anthropic_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, - request_started_at: float, ): """处理 Anthropic 后端的流式 Responses 返回。 @@ -676,18 +608,7 @@ def _handle_anthropic_stream( 'model': ctx.client_model, 'event_count': len(client_events), }) - duration_ms = int((perf_counter() - request_started_at) * 1000) - request_history.record( - route='responses', - client_model=ctx.client_model, - actual_model=ctx.upstream_model, - backend=ctx.backend, - upstream_url=url, - usage=None, - duration_ms=duration_ms, - started_at=(turn or {}).get('started_at'), - ) - finalize_turn(turn, duration_ms=duration_ms) + finalize_turn(turn) return sse_response(generate()) @@ -698,9 +619,6 @@ def _finalize_responses_response( client_model: str, turn: dict[str, Any], debug_label: str, - ctx: RouteContext, - request_started_at: float, - upstream_url: str, ): """统一收尾非流式 Responses 响应。 @@ -711,26 +629,32 @@ def _finalize_responses_response( _dbg(debug_label + '=' + json.dumps(response_data, ensure_ascii=False, default=str)[:1000]) log_usage('响应生成', response_data.get('usage', {}), input_key='input_tokens', output_key='output_tokens') - usage = response_data.get('usage') - duration_ms = int((perf_counter() - request_started_at) * 1000) usage_tracker.record( client_model, - usage, + response_data.get('usage'), input_key='input_tokens', output_key='output_tokens', ) - request_history.record( - route='responses', - client_model=client_model, - actual_model=ctx.upstream_model, - backend=ctx.backend, - upstream_url=upstream_url, - usage=usage, - duration_ms=duration_ms, - started_at=(turn or {}).get('started_at'), - ) attach_client_response(turn, response_data) - finalize_turn(turn, usage=usage, duration_ms=duration_ms) + finalize_turn(turn, usage=response_data.get('usage')) + + output_items = response_data.get('output', []) + if isinstance(output_items, list): + for item in output_items: + if not isinstance(item, dict) or item.get('type') != 'reasoning': + continue + summary = item.get('summary', []) + if not isinstance(summary, list): + continue + reasoning_text = ''.join( + part.get('text', '') + for part in summary + if isinstance(part, dict) and part.get('type') == 'summary_text' + ) + if reasoning_text: + cc_messages = responses_to_cc(request.get_json(silent=True, force=True) or {}).get('messages', []) + thinking_cache.store_from_response(cc_messages, reasoning_text) + break return jsonify(response_data) diff --git a/static/admin.css b/static/admin.css index e52b5e9..875bbcb 100644 --- a/static/admin.css +++ b/static/admin.css @@ -9,7 +9,7 @@ body{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI','PingFang SC','Micr input,select,button,textarea{font-family:inherit;font-size:inherit} a{color:var(--primary);text-decoration:none} code{background:var(--input);padding:1px 5px;border-radius:4px;font-size:12px;font-family:Consolas,Monaco,monospace} -.container{width:min(100%,1680px);margin:0 auto;padding:0 20px} +.container{max-width:960px;margin:0 auto;padding:0 20px} #login{display:flex;align-items:center;justify-content:center;min-height:100vh;background:linear-gradient(145deg,#0b1120 0%,#121a2e 50%,#0b1120 100%)} .login-card{background:var(--card);border:1px solid var(--border);border-radius:16px;padding:40px;width:380px;box-shadow:0 20px 60px rgba(0,0,0,.4)} @@ -83,11 +83,3 @@ main{padding:28px 0 60px} .toast-ok{background:#065f46;color:#a7f3d0} .toast-err{background:#7f1d1d;color:#fca5a5} @keyframes slideIn{from{transform:translateX(100px);opacity:0}to{transform:none;opacity:1}} - -.request-logs-wrap{overflow:auto} -.request-logs-table{min-width:1100px} -.request-logs-table td{vertical-align:top} -.log-url{max-width:320px;word-break:break-all;color:var(--muted)} -.log-status{display:inline-flex;align-items:center;padding:2px 8px;border-radius:999px;font-size:12px;font-weight:600} -.status-ok{background:rgba(34,197,94,.15);color:var(--green)} -.status-error{background:rgba(239,68,68,.15);color:var(--red)} diff --git a/static/admin.html b/static/admin.html index b4a0b85..5d382ad 100644 --- a/static/admin.html +++ b/static/admin.html @@ -4,7 +4,7 @@