diff --git a/README.md b/README.md index b7b6d4c..6dcea52 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,18 @@ api2cursor/ - `file_path` → `path` 字段映射 - `finish_reason` 修正 + +============================ +增加缓存,在api2cursor里面的body修改中加个你喜欢的随意字段: +{ + "prompt_cache_key": "GPT5-4-xxx-xxx" +} +openai 开 fast 模式 +{ + "service_tier": "priority" +} + + ## 许可证 [MIT](LICENSE) diff --git a/adapters/cc_anthropic_adapter.py b/adapters/cc_anthropic_adapter.py index 512b3de..95aced6 100644 --- a/adapters/cc_anthropic_adapter.py +++ b/adapters/cc_anthropic_adapter.py @@ -261,6 +261,12 @@ def _convert_request_message(message: Any) -> tuple[JsonDict | None, str | None] anthropic_role = 'assistant' if role == 'assistant' else 'user' anthropic_content = _convert_content(message) + if role == 'assistant' and message.get('reasoning_content'): + thinking_block = {'type': 'thinking', 'thinking': message['reasoning_content']} + blocks = _to_blocks(anthropic_content) + blocks.insert(0, thinking_block) + anthropic_content = blocks + if role == 'assistant' and 'tool_calls' in message: anthropic_content = _append_tool_use_blocks(anthropic_content, message.get('tool_calls', [])) @@ -463,6 +469,8 @@ def _convert_content_part(part: Any) -> JsonDict | None: return {'type': 'text', 'text': part.get('text', '')} if part_type == 'image_url': return _convert_image(part) + if part_type == 'image': + return part if part_type in ('tool_use', 'tool_result'): return part return None @@ -574,38 +582,16 @@ _EPHEMERAL = {'type': 'ephemeral'} def optimize_cache_control(request: JsonDict) -> None: - """自动设置最优的 Anthropic cache_control 断点。 + """为 Anthropic Messages 请求启用顶层自动 prompt caching。 - 算法移植自 CursorProxy 的 ensure_cache_control.go: - 1. 归一化所有消息 content 为数组格式 - 2. 清空所有已有 cache_control - 3. 注入结构锚点(tools 末尾 + system 末尾) - 4. 注入消息锚点(最后一个可缓存块 + 窗口边界) - 5. 总断点数不超过 4 个 + 2026 版 Claude API 已支持在请求顶层使用 `cache_control` 开启自动缓存, + 由上游自动把断点放到最后一个可缓存块并随多轮对话前移。相比手动在嵌套 + content blocks 上打断点,这种方式对 Anthropic 兼容中转站更稳定,也更接近 + `/v1/responses` 通过顶层字段启用缓存的思路。 """ _normalize_message_contents(request) _clear_all_cache_controls(request) - - structural = _inject_structural_anchors(request) - remaining = _MAX_BREAKPOINTS - structural - if remaining <= 0: - return - - refs = _collect_cacheable_block_refs(request) - if not refs: - return - - desired = 1 if len(refs) < _BLOCK_WINDOW else 2 - anchors = min(desired, remaining) - - if anchors >= 1 and refs: - refs[-1]['cache_control'] = _EPHEMERAL - - if anchors >= 2 and len(refs) > 1: - target = len(refs) - _BLOCK_WINDOW - idx = _pick_window_anchor(refs, target) - if idx is not None and idx != len(refs) - 1: - refs[idx]['cache_control'] = _EPHEMERAL + request['cache_control'] = dict(_EPHEMERAL) def _normalize_message_contents(request: JsonDict) -> None: @@ -620,6 +606,7 @@ def _normalize_message_contents(request: JsonDict) -> None: def _clear_all_cache_controls(request: JsonDict) -> None: """清空所有已有的 cache_control 字段。""" + request.pop('cache_control', None) for tool in request.get('tools', []): tool.pop('cache_control', None) diff --git a/adapters/responses_cc_adapter.py b/adapters/responses_cc_adapter.py index b793dbd..e6c864a 100644 --- a/adapters/responses_cc_adapter.py +++ b/adapters/responses_cc_adapter.py @@ -654,10 +654,6 @@ class ResponsesToCCStreamConverter: 'completion_tokens': self._usage.get('output_tokens', 0), 'total_tokens': self._usage.get('total_tokens', 0), } - if isinstance(self._usage.get('input_tokens_details'), dict): - chunk['usage']['prompt_tokens_details'] = dict(self._usage['input_tokens_details']) - if isinstance(self._usage.get('output_tokens_details'), dict): - chunk['usage']['completion_tokens_details'] = dict(self._usage['output_tokens_details']) return [chunk] def _make_chunk(self, delta: JsonDict, finish_reason: str | None = None) -> JsonDict: @@ -682,44 +678,20 @@ def _copy_request_options(payload: JsonDict, result: JsonDict) -> None: """将 Responses 请求中的通用选项复制到 CC 请求体。""" if 'tools' in payload: result['tools'] = _convert_tools(payload['tools']) - for key in ( - 'temperature', - 'top_p', - 'tool_choice', - 'parallel_tool_calls', - 'truncation', - 'store', - 'metadata', - 'conversation', - 'previous_response_id', - 'prompt_cache_key', - 'service_tier', - 'user', - ): + for key in ('temperature', 'top_p'): if key in payload: result[key] = payload[key] if 'max_output_tokens' in payload: result['max_tokens'] = payload['max_output_tokens'] + if 'tool_choice' in payload: + result['tool_choice'] = payload['tool_choice'] def _copy_responses_request_options(payload: JsonDict, result: JsonDict) -> None: """将聊天补全请求中的通用选项复制到原生 Responses 请求体。""" if 'tools' in payload: result['tools'] = _convert_cc_tools_to_responses(payload['tools']) - for key in ( - 'temperature', - 'top_p', - 'tool_choice', - 'parallel_tool_calls', - 'truncation', - 'store', - 'metadata', - 'conversation', - 'previous_response_id', - 'prompt_cache_key', - 'service_tier', - 'user', - ): + for key in ('temperature', 'top_p', 'tool_choice'): if key in payload: result[key] = payload[key] if 'max_tokens' in payload: @@ -731,7 +703,11 @@ def _append_responses_input_item( instructions: list[str], input_items: list[JsonDict], ) -> None: - """将单条 Chat Completions 消息追加为 Responses `input` 项。""" + """将单条 Chat Completions 消息追加为 Responses `input` 项。 + + 尽量使用 EasyInputMessage 格式({role, content})以减少 token 开销, + 提高上游 prompt caching 的前缀匹配命中率。 + """ if not isinstance(message, dict): return @@ -752,21 +728,26 @@ def _append_responses_input_item( }) return - item: JsonDict = { - 'type': 'message', - 'role': role or 'user', - 'content': _content_to_responses_parts(content, role), - } - input_items.append(item) + text = _content_to_text(content) + has_tool_calls = bool(message.get('tool_calls')) - if role == 'assistant': + if role == 'assistant' and has_tool_calls: + if text: + input_items.append({ + 'type': 'message', + 'role': 'assistant', + 'content': [{'type': 'output_text', 'text': text}], + }) for tool_call in message.get('tool_calls') or []: input_items.append(_build_responses_function_call_item(tool_call)) + else: + input_items.append({'role': role or 'user', 'content': text or ''}) def _convert_input_items(items: list[Any], messages: list[JsonDict]) -> None: """将 Responses `input` 数组重建为 Chat Completions `messages` 列表。""" index = 0 + pending_reasoning: str | None = None while index < len(items): item = items[index] @@ -782,20 +763,35 @@ def _convert_input_items(items: list[Any], messages: list[JsonDict]) -> None: item_type = item.get('type', '') role = item.get('role', '') + if item_type == 'reasoning': + pending_reasoning = _extract_reasoning_text(item) + index += 1 + continue + if role and not item_type: - messages.append({ + msg: JsonDict = { 'role': role, 'content': _normalize_simple_content(item.get('content', '')), - }) + } + if role == 'assistant' and pending_reasoning: + msg['reasoning_content'] = pending_reasoning + pending_reasoning = None + messages.append(msg) index += 1 continue if item_type == 'message': consumed = _append_message_item(items, start=index, messages=messages) + if item.get('role') == 'assistant' and pending_reasoning and messages: + messages[-1]['reasoning_content'] = pending_reasoning + pending_reasoning = None index += consumed continue if item_type == 'function_call': + if pending_reasoning and messages and messages[-1].get('role') == 'assistant': + messages[-1]['reasoning_content'] = pending_reasoning + pending_reasoning = None _append_function_call_item(item, messages) index += 1 continue @@ -942,18 +938,11 @@ def _make_function_call_output_item(tool_call: JsonDict) -> JsonDict: def _build_responses_usage(usage: JsonDict) -> JsonDict: """将 Chat Completions 的 usage 字段映射为 Responses usage 结构。""" - result = { + return { 'input_tokens': usage.get('prompt_tokens', 0), 'output_tokens': usage.get('completion_tokens', 0), 'total_tokens': usage.get('total_tokens', 0), } - prompt_details = usage.get('prompt_tokens_details') - if isinstance(prompt_details, dict): - result['input_tokens_details'] = dict(prompt_details) - completion_details = usage.get('completion_tokens_details') - if isinstance(completion_details, dict): - result['output_tokens_details'] = dict(completion_details) - return result def _collect_cc_parts_from_responses_output(output_items: Any) -> tuple[str, str, list[JsonDict]]: diff --git a/compose.yml b/docker-compose.yml similarity index 92% rename from compose.yml rename to docker-compose.yml index 944b836..3d657de 100644 --- a/compose.yml +++ b/docker-compose.yml @@ -1,6 +1,7 @@ services: api2cursor: build: . + container_name: api2cursor ports: - "${PROXY_PORT:-3029}:${PROXY_PORT:-3029}" env_file: diff --git a/routes/admin.py b/routes/admin.py index e8a9e77..612ee89 100644 --- a/routes/admin.py +++ b/routes/admin.py @@ -13,6 +13,7 @@ from flask import Blueprint, request, jsonify, send_from_directory import settings from config import Config +from utils.request_history import request_history logger = logging.getLogger(__name__) @@ -202,6 +203,15 @@ def get_stats(): return jsonify(usage_tracker.get_stats()) +@bp.route('/api/admin/request-logs', methods=['GET']) +def get_request_logs(): + """返回最近 500 条请求日志。""" + err = _check_auth() + if err: + return err + return jsonify({'items': request_history.get_recent(500)}) + + # ─── 内部辅助 ───────────────────────────────────── diff --git a/routes/chat.py b/routes/chat.py index 8bea531..9532ed0 100644 --- a/routes/chat.py +++ b/routes/chat.py @@ -9,6 +9,7 @@ from __future__ import annotations import json import logging +from time import perf_counter from typing import Any import settings @@ -42,9 +43,7 @@ from routes.common import ( build_responses_target, build_route_context, chat_error_chunk, - ensure_responses_cache_control, - attach_previous_response_id, - remember_response_id, + ensure_prompt_cache_key, inject_instructions_anthropic, inject_instructions_cc, inject_instructions_responses, @@ -61,6 +60,7 @@ from utils.http import ( iter_responses_sse, sse_response, ) +from utils.request_history import request_history from utils.request_logger import ( append_client_event, append_upstream_event, @@ -115,6 +115,7 @@ def chat_completions(): client_model = payload.get('model', 'unknown') is_stream = payload.get('stream', False) ctx = build_route_context(client_model, is_stream) + request_started_at = perf_counter() turn = start_turn( route='chat', client_model=client_model, @@ -130,15 +131,16 @@ def chat_completions(): log_route_context('聊天补全', ctx, extra=f'消息数={message_count}') _log_messages(payload) - payload['messages'] = thinking_cache.inject(payload.get('messages', [])) + if ctx.backend != 'responses': + payload['messages'] = thinking_cache.inject(payload.get('messages', [])) if ctx.backend == 'openai': - return _handle_openai_backend(ctx, payload, turn) + return _handle_openai_backend(ctx, payload, turn, request_started_at) if ctx.backend == 'responses': - return _handle_responses_backend(ctx, payload, turn) + return _handle_responses_backend(ctx, payload, turn, request_started_at) if ctx.backend == 'gemini': - return _handle_gemini_backend(ctx, payload, turn) - return _handle_anthropic_backend(ctx, payload, turn) + return _handle_gemini_backend(ctx, payload, turn, request_started_at) + return _handle_anthropic_backend(ctx, payload, turn, request_started_at) def _normalize_chat_payload(payload: dict[str, Any]) -> tuple[dict[str, Any], int]: @@ -159,7 +161,12 @@ def _normalize_chat_payload(payload: dict[str, Any]) -> tuple[dict[str, Any], in return payload, message_count -def _handle_openai_backend(ctx: RouteContext, payload: dict[str, Any], turn: dict[str, Any]): +def _handle_openai_backend( + ctx: RouteContext, + payload: dict[str, Any], + turn: dict[str, Any], + request_started_at: float, +): """处理走 OpenAI 兼容后端的聊天补全请求。""" _dbg( '原始请求字段=' + str(list(payload.keys())) + ' ' @@ -183,8 +190,8 @@ def _handle_openai_backend(ctx: RouteContext, payload: dict[str, Any], turn: dic headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_openai_stream(ctx, payload, url, headers, turn) - return _handle_openai_non_stream(ctx, payload, url, headers, turn) + return _handle_openai_stream(ctx, payload, url, headers, turn, request_started_at) + return _handle_openai_non_stream(ctx, payload, url, headers, turn, request_started_at) def _handle_openai_non_stream( @@ -193,6 +200,7 @@ def _handle_openai_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any], + request_started_at: float, ): """处理 OpenAI 兼容后端的非流式返回。""" payload['stream'] = False @@ -208,7 +216,14 @@ def _handle_openai_non_stream( _dbg('上游原始响应=' + json.dumps(raw, ensure_ascii=False, default=str)[:1000]) data = fix_response(raw) - return _finalize_chat_response(ctx, data, turn=turn, debug_label='修复后响应') + return _finalize_chat_response( + ctx, + data, + turn=turn, + debug_label='修复后响应', + request_started_at=request_started_at, + upstream_url=url, + ) def _handle_openai_stream( @@ -217,6 +232,7 @@ def _handle_openai_stream( url: str, headers: dict[str, str], turn: dict[str, Any], + request_started_at: float, ): """处理 OpenAI 兼容后端的流式返回。""" payload['stream'] = True @@ -259,7 +275,18 @@ def _handle_openai_stream( 'chunk_count': len(client_chunks), 'usage': last_usage, }) - finalize_turn(turn, usage=last_usage) + duration_ms = int((perf_counter() - request_started_at) * 1000) + request_history.record( + route='chat', + client_model=ctx.client_model, + actual_model=ctx.upstream_model, + backend=ctx.backend, + upstream_url=url, + usage=last_usage, + duration_ms=duration_ms, + started_at=(turn or {}).get('started_at'), + ) + finalize_turn(turn, usage=last_usage, duration_ms=duration_ms) return append_upstream_event(turn, {'type': 'openai_chunk', 'data': chunk}) @@ -300,12 +327,28 @@ def _handle_openai_stream( 'chunk_count': len(client_chunks), 'usage': last_usage, }) - finalize_turn(turn, usage=last_usage) + duration_ms = int((perf_counter() - request_started_at) * 1000) + request_history.record( + route='chat', + client_model=ctx.client_model, + actual_model=ctx.upstream_model, + backend=ctx.backend, + upstream_url=url, + usage=last_usage, + duration_ms=duration_ms, + started_at=(turn or {}).get('started_at'), + ) + finalize_turn(turn, usage=last_usage, duration_ms=duration_ms) return sse_response(generate()) -def _handle_responses_backend(ctx: RouteContext, payload: dict[str, Any], turn: dict[str, Any] | None): +def _handle_responses_backend( + ctx: RouteContext, + payload: dict[str, Any], + turn: dict[str, Any] | None, + request_started_at: float, +): """处理走原生 Responses 后端的聊天补全请求。 当上游只支持 `/v1/responses` 时,需要先把聊天补全请求转换为 Responses 请求, @@ -314,8 +357,7 @@ def _handle_responses_backend(ctx: RouteContext, payload: dict[str, Any], turn: responses_payload = cc_to_responses_request(payload) responses_payload['model'] = ctx.upstream_model responses_payload = inject_instructions_responses(responses_payload, ctx.custom_instructions, ctx.instructions_position) - responses_payload = ensure_responses_cache_control(responses_payload) - responses_payload = attach_previous_response_id(responses_payload) + responses_payload = ensure_prompt_cache_key(responses_payload) _dbg( '已转换为 Responses 请求:字段=' + str(list(responses_payload.keys())) + f' 输入项数={len(responses_payload.get("input", []))}' @@ -326,8 +368,8 @@ def _handle_responses_backend(ctx: RouteContext, payload: dict[str, Any], turn: headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_responses_stream(ctx, responses_payload, url, headers, turn) - return _handle_responses_non_stream(ctx, responses_payload, url, headers, turn) + return _handle_responses_stream(ctx, responses_payload, url, headers, turn, request_started_at) + return _handle_responses_non_stream(ctx, responses_payload, url, headers, turn, request_started_at) def _handle_responses_non_stream( @@ -336,6 +378,7 @@ def _handle_responses_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, + request_started_at: float, ): """处理原生 Responses 后端的非流式返回。""" payload['stream'] = False @@ -350,9 +393,15 @@ def _handle_responses_non_stream( attach_upstream_response(turn, raw) _dbg('上游原始响应=' + json.dumps(raw, ensure_ascii=False, default=str)[:1000]) - remember_response_id(payload, raw) data = responses_to_cc_response(raw, ctx.client_model) - return _finalize_chat_response(ctx, data, turn=turn, debug_label='Responses 转回聊天补全后') + return _finalize_chat_response( + ctx, + data, + turn=turn, + debug_label='Responses 转回聊天补全后', + request_started_at=request_started_at, + upstream_url=url, + ) def _handle_responses_stream( @@ -361,6 +410,7 @@ def _handle_responses_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, + request_started_at: float, ): """处理原生 Responses 后端的流式返回。""" payload['stream'] = True @@ -389,10 +439,6 @@ def _handle_responses_stream( 'completion_tokens': extracted_usage.get('output_tokens', 0), 'total_tokens': extracted_usage.get('total_tokens', 0), } - if event_type == 'response.completed': - response_obj = event_data.get('response') if isinstance(event_data, dict) else None - if isinstance(response_obj, dict): - remember_response_id(payload, response_obj) if event_count < 10: _dbg( f'上游事件#{event_count} 类型={event_type} 数据=' @@ -428,12 +474,28 @@ def _handle_responses_stream( 'chunk_count': len(client_chunks), 'usage': last_usage, }) - finalize_turn(turn, usage=last_usage) + duration_ms = int((perf_counter() - request_started_at) * 1000) + request_history.record( + route='chat', + client_model=ctx.client_model, + actual_model=ctx.upstream_model, + backend=ctx.backend, + upstream_url=url, + usage=last_usage, + duration_ms=duration_ms, + started_at=(turn or {}).get('started_at'), + ) + finalize_turn(turn, usage=last_usage, duration_ms=duration_ms) return sse_response(generate()) -def _handle_gemini_backend(ctx: RouteContext, payload: dict[str, Any], turn: dict[str, Any] | None): +def _handle_gemini_backend( + ctx: RouteContext, + payload: dict[str, Any], + turn: dict[str, Any] | None, + request_started_at: float, +): """处理走 Gemini Contents 后端的聊天补全请求。""" payload = inject_instructions_cc(payload, ctx.custom_instructions, ctx.instructions_position) gemini_payload = cc_to_gemini_request(payload) @@ -447,8 +509,8 @@ def _handle_gemini_backend(ctx: RouteContext, payload: dict[str, Any], turn: dic headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_gemini_stream(ctx, gemini_payload, url, headers, turn) - return _handle_gemini_non_stream(ctx, gemini_payload, url, headers, turn) + return _handle_gemini_stream(ctx, gemini_payload, url, headers, turn, request_started_at) + return _handle_gemini_non_stream(ctx, gemini_payload, url, headers, turn, request_started_at) def _handle_gemini_non_stream( @@ -457,6 +519,7 @@ def _handle_gemini_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, + request_started_at: float, ): """处理 Gemini 后端的非流式返回。""" attach_upstream_request(turn, payload, headers) @@ -471,7 +534,14 @@ def _handle_gemini_non_stream( _dbg('上游原始响应=' + json.dumps(raw, ensure_ascii=False, default=str)[:1000]) data = gemini_to_cc_response(raw) - return _finalize_chat_response(ctx, data, turn=turn, debug_label='Gemini 转回聊天补全后') + return _finalize_chat_response( + ctx, + data, + turn=turn, + debug_label='Gemini 转回聊天补全后', + request_started_at=request_started_at, + upstream_url=url, + ) def _handle_gemini_stream( @@ -480,6 +550,7 @@ def _handle_gemini_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, + request_started_at: float, ): """处理 Gemini 后端的流式返回。""" converter = GeminiStreamConverter() @@ -542,12 +613,28 @@ def _handle_gemini_stream( 'chunk_count': len(client_chunks), 'usage': last_usage, }) - finalize_turn(turn, usage=last_usage) + duration_ms = int((perf_counter() - request_started_at) * 1000) + request_history.record( + route='chat', + client_model=ctx.client_model, + actual_model=ctx.upstream_model, + backend=ctx.backend, + upstream_url=url, + usage=last_usage, + duration_ms=duration_ms, + started_at=(turn or {}).get('started_at'), + ) + finalize_turn(turn, usage=last_usage, duration_ms=duration_ms) return sse_response(generate()) -def _handle_anthropic_backend(ctx: RouteContext, payload: dict[str, Any], turn: dict[str, Any] | None): +def _handle_anthropic_backend( + ctx: RouteContext, + payload: dict[str, Any], + turn: dict[str, Any] | None, + request_started_at: float, +): """处理走 Anthropic Messages 后端的聊天补全请求。""" payload['model'] = ctx.upstream_model anthropic_payload = cc_to_messages_request(payload) @@ -562,8 +649,8 @@ def _handle_anthropic_backend(ctx: RouteContext, payload: dict[str, Any], turn: headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_anthropic_stream(ctx, anthropic_payload, url, headers, turn) - return _handle_anthropic_non_stream(ctx, anthropic_payload, url, headers, turn) + return _handle_anthropic_stream(ctx, anthropic_payload, url, headers, turn, request_started_at) + return _handle_anthropic_non_stream(ctx, anthropic_payload, url, headers, turn, request_started_at) def _handle_anthropic_non_stream( @@ -572,6 +659,7 @@ def _handle_anthropic_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, + request_started_at: float, ): """处理 Anthropic 后端的非流式返回。""" payload['stream'] = False @@ -587,7 +675,14 @@ def _handle_anthropic_non_stream( _dbg('上游原始响应=' + json.dumps(raw, ensure_ascii=False, default=str)[:1000]) data = messages_to_cc_response(raw) - return _finalize_chat_response(ctx, data, turn=turn, debug_label='Messages 转回聊天补全后') + return _finalize_chat_response( + ctx, + data, + turn=turn, + debug_label='Messages 转回聊天补全后', + request_started_at=request_started_at, + upstream_url=url, + ) def _handle_anthropic_stream( @@ -596,6 +691,7 @@ def _handle_anthropic_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, + request_started_at: float, ): """处理 Anthropic 后端的流式返回。 @@ -680,7 +776,18 @@ def _handle_anthropic_stream( 'chunk_count': len(client_chunks), 'usage': last_usage, }) - finalize_turn(turn, usage=last_usage) + duration_ms = int((perf_counter() - request_started_at) * 1000) + request_history.record( + route='chat', + client_model=ctx.client_model, + actual_model=ctx.upstream_model, + backend=ctx.backend, + upstream_url=url, + usage=last_usage, + duration_ms=duration_ms, + started_at=(turn or {}).get('started_at'), + ) + finalize_turn(turn, usage=last_usage, duration_ms=duration_ms) return sse_response(generate()) @@ -691,6 +798,8 @@ def _finalize_chat_response( *, turn: dict[str, Any] | None, debug_label: str, + request_started_at: float, + upstream_url: str, ): """统一收尾非流式聊天补全响应。 @@ -703,9 +812,21 @@ def _finalize_chat_response( _dbg(debug_label + '=' + json.dumps(data, ensure_ascii=False, default=str)[:1000]) log_usage('聊天补全', data.get('usage', {}), input_key='prompt_tokens', output_key='completion_tokens') - usage_tracker.record(ctx.client_model, data.get('usage')) + usage = data.get('usage') + duration_ms = int((perf_counter() - request_started_at) * 1000) + usage_tracker.record(ctx.client_model, usage) + request_history.record( + route='chat', + client_model=ctx.client_model, + actual_model=ctx.upstream_model, + backend=ctx.backend, + upstream_url=upstream_url, + usage=usage, + duration_ms=duration_ms, + started_at=(turn or {}).get('started_at'), + ) attach_client_response(turn, data) - finalize_turn(turn, usage=data.get('usage')) + finalize_turn(turn, usage=usage, duration_ms=duration_ms) for choice in data.get('choices', []): msg = choice.get('message', {}) diff --git a/routes/common.py b/routes/common.py index f008b96..9f57233 100644 --- a/routes/common.py +++ b/routes/common.py @@ -10,8 +10,6 @@ from dataclasses import dataclass import hashlib import json import logging -import threading -import time from typing import Any import settings @@ -19,10 +17,6 @@ from utils.http import build_anthropic_headers, build_gemini_headers, build_open logger = logging.getLogger(__name__) -_RESPONSES_PREV_ID_LOCK = threading.Lock() -_RESPONSES_PREV_ID_TTL = 86400 -_RESPONSES_PREV_IDS: dict[str, tuple[str, float]] = {} - @dataclass(frozen=True) class RouteContext: @@ -202,178 +196,6 @@ def inject_instructions_responses(payload: dict[str, Any], instructions: str, po return payload -def ensure_responses_cache_control(payload: dict[str, Any]) -> dict[str, Any]: - """为 Responses 请求补齐自动 prompt caching 开关。 - - 一些支持 `/v1/responses` 的上游会参考顶层 `cache_control` 来自动放置缓存断点。 - Cursor 侧通常不会主动携带这个字段,因此这里在缺失时补一个保守的默认值, - 同时允许调用方通过 body_modifications 或显式字段自行覆盖/关闭。 - """ - if not isinstance(payload, dict): - return payload - cache_control = payload.get('cache_control') - if isinstance(cache_control, dict) and cache_control.get('type'): - return payload - payload['cache_control'] = {'type': 'ephemeral'} - logger.info('已为 Responses 请求自动启用 cache_control=ephemeral') - return payload - - -def attach_previous_response_id(payload: dict[str, Any]) -> dict[str, Any]: - """为多轮 Responses 请求补齐上一轮 response_id。 - - 某些上游在 `/v1/responses` 多轮场景下,只有沿用 `previous_response_id` 才能稳定复用 - 上一轮的服务端响应链与缓存。Cursor 通常会回传完整历史,但不会主动带这个字段, - 因此代理需要基于稳定对话键做一次轻量补齐。 - """ - if not isinstance(payload, dict) or payload.get('previous_response_id'): - return payload - key = _responses_prev_id_key(payload) - if not key: - return payload - previous_response_id = _get_previous_response_id(key) - if not previous_response_id: - return payload - payload['previous_response_id'] = previous_response_id - logger.info('已为 Responses 请求补齐 previous_response_id') - return payload - - -def remember_response_id(payload: dict[str, Any], response_data: dict[str, Any]) -> None: - """记住当前对话最近一次上游 Responses response_id。""" - if not isinstance(payload, dict) or not isinstance(response_data, dict): - return - response_id = response_data.get('id') - if not isinstance(response_id, str) or not response_id.strip(): - return - key = _responses_prev_id_key(payload) - if not key: - return - with _RESPONSES_PREV_ID_LOCK: - _RESPONSES_PREV_IDS[key] = (response_id.strip(), time.time()) - _cleanup_previous_response_ids_locked() - - -def _responses_prev_id_key(payload: dict[str, Any]) -> str: - """基于 Responses 请求的“对话根信息”生成稳定键。 - - 这里故意不直接使用完整 `input` 作为键,因为多轮对话每轮都会追加历史; - 如果把整段历史都纳入哈希,键会在每一轮变化,导致无法稳定取回上一轮的 - `previous_response_id`。当前策略只取 instructions 与首轮 user/assistant 根消息。 - """ - instructions = payload.get('instructions') or '' - input_data = payload.get('input', []) - if isinstance(input_data, str): - seed_input = input_data - elif isinstance(input_data, list): - seed_input = _responses_root_seed_from_items(input_data) - else: - seed_input = json.dumps(input_data, ensure_ascii=False, default=str) - raw = instructions + '|' + seed_input - if not raw.strip('|'): - return '' - return hashlib.sha256(raw.encode('utf-8')).hexdigest()[:24] - - -def _responses_root_seed_from_items(items: list[Any]) -> str: - """从 Responses `input` 中提取足够稳定的对话根片段。 - - 目标不是完整还原会话,而是构造一个在同一段对话内尽量恒定、跨轮次可复用的 - seed。这里沿用项目里 conversation seed 的思路:优先取第一条 user 与第一条 - assistant;如果 assistant 还不存在,则只用第一条 user。 - """ - first_user = None - first_assistant = None - for item in items: - if isinstance(item, str): - if first_user is None: - first_user = {'role': 'user', 'content': item} - continue - if not isinstance(item, dict): - continue - item_type = item.get('type', '') - role = item.get('role', '') - if item_type == 'message' and role in ('user', 'assistant'): - normalized = { - 'role': role, - 'content': _responses_normalize_content(item.get('content', [])), - } - if role == 'user' and first_user is None: - first_user = normalized - elif role == 'assistant' and first_assistant is None: - first_assistant = normalized - elif role in ('user', 'assistant') and not item_type: - normalized = { - 'role': role, - 'content': _responses_normalize_content(item.get('content', '')), - } - if role == 'user' and first_user is None: - first_user = normalized - elif role == 'assistant' and first_assistant is None: - first_assistant = normalized - if first_user is not None and first_assistant is not None: - break - parts = [] - if first_user is not None: - parts.append(first_user) - if first_assistant is not None: - parts.append(first_assistant) - return json.dumps(parts, ensure_ascii=False, separators=(',', ':')) - - -def _responses_normalize_content(content: Any) -> str: - """把 Responses 各种 content 形态折叠成稳定文本。 - - 这里的目标不是保真展示,而是降低结构差异对 key 计算的影响;只抽取会影响 - 会话根语义的文本型内容,忽略无关字段,避免同一轮请求因格式细节不同而得到 - 不同的 previous_response_id 键。 - """ - if isinstance(content, str): - return content.strip() - if not isinstance(content, list): - return str(content).strip() if content is not None else '' - texts: list[str] = [] - for part in content: - if isinstance(part, str): - texts.append(part) - continue - if not isinstance(part, dict): - continue - if part.get('type') in ('input_text', 'output_text', 'text'): - texts.append(part.get('text', '')) - elif part.get('type') == 'summary_text': - texts.append(part.get('text', '')) - return '\n'.join(texts).strip() - - -def _get_previous_response_id(key: str) -> str: - """按稳定键读取上一轮 response_id,并在过期时顺手清理。""" - with _RESPONSES_PREV_ID_LOCK: - entry = _RESPONSES_PREV_IDS.get(key) - if not entry: - return '' - response_id, ts = entry - if (time.time() - ts) >= _RESPONSES_PREV_ID_TTL: - _RESPONSES_PREV_IDS.pop(key, None) - return '' - return response_id - - -def _cleanup_previous_response_ids_locked() -> None: - """清理过期的 previous_response_id 缓存项。 - - 这张表只用于短期多轮续接;一旦对话长时间不活跃,就不再需要继续保留, - 以免常驻进程运行过久后累计过多失效状态。 - """ - now = time.time() - expired = [ - key for key, (_, ts) in _RESPONSES_PREV_IDS.items() - if (now - ts) >= _RESPONSES_PREV_ID_TTL - ] - for key in expired: - _RESPONSES_PREV_IDS.pop(key, None) - - def inject_instructions_anthropic(payload: dict[str, Any], instructions: str, position: str = 'prepend') -> dict[str, Any]: """向 Anthropic Messages 请求注入自定义指令(写入 system 字段)。 @@ -397,6 +219,23 @@ def inject_instructions_anthropic(payload: dict[str, Any], instructions: str, po # ─── Body / Header 修改 ────────────────────────── +def ensure_prompt_cache_key(payload: dict[str, Any]) -> dict[str, Any]: + """确保 Responses 请求携带 prompt_cache_key 以启用上游提示缓存。 + + 上游(如 sub2api)对原生 /v1/responses 请求不会自动生成 prompt_cache_key, + 导致提示缓存无法命中。这里根据模型名 + instructions 生成稳定的 cache key, + 使得相同模型和系统提示的对话可以共享缓存前缀。 + """ + if payload.get('prompt_cache_key'): + return payload + + model = payload.get('model', '') + instructions = payload.get('instructions', '') + seed = f'{model}|{instructions}' + payload['prompt_cache_key'] = hashlib.sha256(seed.encode()).hexdigest()[:32] + return payload + + def apply_body_modifications(payload: dict[str, Any], modifications: dict[str, Any]) -> dict[str, Any]: """对转发请求体应用字段级修改。 diff --git a/routes/messages.py b/routes/messages.py index 0d9faa5..a320081 100644 --- a/routes/messages.py +++ b/routes/messages.py @@ -7,6 +7,7 @@ Anthropic Messages API 透传。当 Cursor 直接发送 Anthropic 格式请求 import json import logging +from time import perf_counter import requests as req_lib from flask import Blueprint, request, jsonify @@ -15,6 +16,7 @@ import settings from config import Config from routes.common import apply_body_modifications, apply_header_modifications, inject_instructions_anthropic from utils.http import build_anthropic_headers, forward_request, sse_response +from utils.request_history import request_history from utils.request_logger import ( append_client_event, append_upstream_event, @@ -40,6 +42,7 @@ def messages_passthrough(): model = payload.get('model', 'unknown') is_stream = payload.get('stream', False) + request_started_at = perf_counter() logger.info(f'[透传] model={model} 流式={is_stream}') mapping = settings.resolve_model(model) @@ -78,7 +81,18 @@ def messages_passthrough(): attach_upstream_response(turn, data) _inject_thinking(data) attach_client_response(turn, data) - finalize_turn(turn) + duration_ms = int((perf_counter() - request_started_at) * 1000) + request_history.record( + route='messages', + client_model=model, + actual_model=model, + backend='anthropic', + upstream_url=url, + usage=data.get('usage'), + duration_ms=duration_ms, + started_at=(turn or {}).get('started_at'), + ) + finalize_turn(turn, usage=data.get('usage'), duration_ms=duration_ms) return jsonify(data) def generate(): @@ -108,7 +122,18 @@ def messages_passthrough(): 'type': 'messages.stream.summary', 'event_count': len(client_events), }) - finalize_turn(turn) + duration_ms = int((perf_counter() - request_started_at) * 1000) + request_history.record( + route='messages', + client_model=model, + actual_model=model, + backend='anthropic', + upstream_url=url, + usage=None, + duration_ms=duration_ms, + started_at=(turn or {}).get('started_at'), + ) + finalize_turn(turn, duration_ms=duration_ms) except req_lib.RequestException as e: logger.error(f'请求上游失败: {e}') attach_error(turn, {'stage': 'request_exception', 'message': str(e)}) diff --git a/routes/responses.py b/routes/responses.py index dd32d5c..271c30f 100644 --- a/routes/responses.py +++ b/routes/responses.py @@ -8,6 +8,7 @@ from __future__ import annotations import json import logging +from time import perf_counter from typing import Any import settings @@ -27,9 +28,7 @@ from routes.common import ( build_openai_target, build_responses_target, build_route_context, - ensure_responses_cache_control, - attach_previous_response_id, - remember_response_id, + ensure_prompt_cache_key, inject_instructions_anthropic, inject_instructions_cc, inject_instructions_responses, @@ -46,6 +45,7 @@ from utils.http import ( iter_responses_sse, sse_response, ) +from utils.request_history import request_history from utils.request_logger import ( append_client_event, append_upstream_event, @@ -80,6 +80,7 @@ def responses_endpoint(): client_model = payload.get('model', 'unknown') is_stream = payload.get('stream', False) + request_started_at = perf_counter() ctx = build_route_context(client_model, is_stream) turn = start_turn( route='responses', @@ -96,12 +97,12 @@ def responses_endpoint(): cc_payload = _build_cc_payload(payload, ctx) if ctx.backend == 'openai': - return _handle_openai_backend(ctx, cc_payload, turn) + return _handle_openai_backend(ctx, cc_payload, turn, request_started_at) if ctx.backend == 'responses': - return _handle_responses_backend(ctx, payload, turn) + return _handle_responses_backend(ctx, payload, turn, request_started_at) if ctx.backend == 'gemini': - return _handle_gemini_backend(ctx, cc_payload, turn) - return _handle_anthropic_backend(ctx, cc_payload, turn) + return _handle_gemini_backend(ctx, cc_payload, turn, request_started_at) + return _handle_anthropic_backend(ctx, cc_payload, turn, request_started_at) def _build_cc_payload(payload: dict[str, Any], ctx: RouteContext) -> dict[str, Any]: @@ -121,7 +122,12 @@ def _build_cc_payload(payload: dict[str, Any], ctx: RouteContext) -> dict[str, A return cc_payload -def _handle_openai_backend(ctx: RouteContext, cc_payload: dict[str, Any], turn: dict[str, Any]): +def _handle_openai_backend( + ctx: RouteContext, + cc_payload: dict[str, Any], + turn: dict[str, Any], + request_started_at: float, +): """处理走 OpenAI 兼容后端的 Responses 请求。""" cc_payload = normalize_request(cc_payload) _dbg( @@ -134,8 +140,8 @@ def _handle_openai_backend(ctx: RouteContext, cc_payload: dict[str, Any], turn: headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_openai_stream(ctx, cc_payload, url, headers, turn) - return _handle_openai_non_stream(ctx, cc_payload, url, headers, turn) + return _handle_openai_stream(ctx, cc_payload, url, headers, turn, request_started_at) + return _handle_openai_non_stream(ctx, cc_payload, url, headers, turn, request_started_at) def _handle_openai_non_stream( @@ -144,6 +150,7 @@ def _handle_openai_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any], + request_started_at: float, ): """处理 OpenAI 兼容后端的非流式 Responses 返回。""" cc_payload['stream'] = False @@ -165,6 +172,9 @@ def _handle_openai_non_stream( client_model=ctx.client_model, turn=turn, debug_label='转换为 Responses 后', + ctx=ctx, + request_started_at=request_started_at, + upstream_url=url, ) @@ -174,6 +184,7 @@ def _handle_openai_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, + request_started_at: float, ): """处理 OpenAI 兼容后端的流式 Responses 返回。""" cc_payload['stream'] = True @@ -214,7 +225,18 @@ def _handle_openai_stream( 'model': ctx.client_model, 'event_count': len(client_events), }) - finalize_turn(turn) + duration_ms = int((perf_counter() - request_started_at) * 1000) + request_history.record( + route='responses', + client_model=ctx.client_model, + actual_model=ctx.upstream_model, + backend=ctx.backend, + upstream_url=url, + usage=None, + duration_ms=duration_ms, + started_at=(turn or {}).get('started_at'), + ) + finalize_turn(turn, duration_ms=duration_ms) return append_upstream_event(turn, {'type': 'openai_chunk', 'data': chunk}) @@ -241,7 +263,12 @@ def _handle_openai_stream( return sse_response(generate()) -def _handle_responses_backend(ctx: RouteContext, payload: dict[str, Any], turn: dict[str, Any] | None): +def _handle_responses_backend( + ctx: RouteContext, + payload: dict[str, Any], + turn: dict[str, Any] | None, + request_started_at: float, +): """处理走原生 Responses 后端的请求。 当中转站本身就只支持 `/v1/responses` 时,不需要再绕到聊天补全中间协议, @@ -250,15 +277,14 @@ def _handle_responses_backend(ctx: RouteContext, payload: dict[str, Any], turn: payload = dict(payload) payload['model'] = ctx.upstream_model payload = inject_instructions_responses(payload, ctx.custom_instructions, ctx.instructions_position) - payload = ensure_responses_cache_control(payload) - payload = attach_previous_response_id(payload) + payload = ensure_prompt_cache_key(payload) url, headers = build_responses_target(ctx) payload = apply_body_modifications(payload, ctx.body_modifications) headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_responses_stream(ctx, payload, url, headers, turn) - return _handle_responses_non_stream(ctx, payload, url, headers, turn) + return _handle_responses_stream(ctx, payload, url, headers, turn, request_started_at) + return _handle_responses_non_stream(ctx, payload, url, headers, turn, request_started_at) def _handle_responses_non_stream( @@ -267,6 +293,7 @@ def _handle_responses_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, + request_started_at: float, ): """处理原生 Responses 后端的非流式返回。""" payload['stream'] = False @@ -279,13 +306,15 @@ def _handle_responses_non_stream( response_data = resp.json() attach_upstream_response(turn, response_data) - remember_response_id(payload, response_data) response_data['model'] = ctx.client_model return _finalize_responses_response( response_data, client_model=ctx.client_model, turn=turn, debug_label='原生 Responses 返回后', + ctx=ctx, + request_started_at=request_started_at, + upstream_url=url, ) @@ -295,6 +324,7 @@ def _handle_responses_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, + request_started_at: float, ): """处理原生 Responses 后端的流式返回。""" payload['stream'] = True @@ -319,10 +349,6 @@ def _handle_responses_stream( extracted_usage = _extract_responses_usage(event_data) if extracted_usage: last_usage = extracted_usage - if event_type == 'response.completed': - response_obj = event_data.get('response') if isinstance(event_data, dict) else None - if isinstance(response_obj, dict): - remember_response_id(payload, response_obj) if event_count < 10: _dbg( f'上游事件#{event_count} 类型={event_type} 数据=' @@ -353,7 +379,18 @@ def _handle_responses_stream( 'event_count': len(client_events), 'usage': last_usage, }) - finalize_turn(turn, usage=last_usage) + duration_ms = int((perf_counter() - request_started_at) * 1000) + request_history.record( + route='responses', + client_model=ctx.client_model, + actual_model=ctx.upstream_model, + backend=ctx.backend, + upstream_url=url, + usage=last_usage, + duration_ms=duration_ms, + started_at=(turn or {}).get('started_at'), + ) + finalize_turn(turn, usage=last_usage, duration_ms=duration_ms) return sse_response(generate()) @@ -377,7 +414,12 @@ def _extract_responses_usage(event_data: dict[str, Any]) -> dict[str, Any] | Non return None -def _handle_gemini_backend(ctx: RouteContext, cc_payload: dict[str, Any], turn: dict[str, Any] | None): +def _handle_gemini_backend( + ctx: RouteContext, + cc_payload: dict[str, Any], + turn: dict[str, Any] | None, + request_started_at: float, +): """处理走 Gemini Contents 后端的 Responses 请求。""" gemini_payload = cc_to_gemini_request(cc_payload) _dbg( @@ -390,8 +432,8 @@ def _handle_gemini_backend(ctx: RouteContext, cc_payload: dict[str, Any], turn: headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_gemini_stream(ctx, gemini_payload, url, headers, turn) - return _handle_gemini_non_stream(ctx, gemini_payload, url, headers, turn) + return _handle_gemini_stream(ctx, gemini_payload, url, headers, turn, request_started_at) + return _handle_gemini_non_stream(ctx, gemini_payload, url, headers, turn, request_started_at) def _handle_gemini_non_stream( @@ -400,6 +442,7 @@ def _handle_gemini_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, + request_started_at: float, ): """处理 Gemini 后端的非流式 Responses 返回。""" attach_upstream_request(turn, payload, headers) @@ -420,6 +463,9 @@ def _handle_gemini_non_stream( client_model=ctx.client_model, turn=turn, debug_label='Gemini 转回 Responses 后', + ctx=ctx, + request_started_at=request_started_at, + upstream_url=url, ) @@ -429,6 +475,7 @@ def _handle_gemini_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, + request_started_at: float, ): """处理 Gemini 后端的流式 Responses 返回。""" converter = ResponsesStreamConverter(model=ctx.client_model) @@ -495,12 +542,28 @@ def _handle_gemini_stream( 'event_count': len(client_events), 'usage': last_usage, }) - finalize_turn(turn, usage=last_usage) + duration_ms = int((perf_counter() - request_started_at) * 1000) + request_history.record( + route='responses', + client_model=ctx.client_model, + actual_model=ctx.upstream_model, + backend=ctx.backend, + upstream_url=url, + usage=last_usage, + duration_ms=duration_ms, + started_at=(turn or {}).get('started_at'), + ) + finalize_turn(turn, usage=last_usage, duration_ms=duration_ms) return sse_response(generate()) -def _handle_anthropic_backend(ctx: RouteContext, cc_payload: dict[str, Any], turn: dict[str, Any] | None): +def _handle_anthropic_backend( + ctx: RouteContext, + cc_payload: dict[str, Any], + turn: dict[str, Any] | None, + request_started_at: float, +): """处理走 Anthropic 后端的 Responses 请求。""" anthropic_payload = cc_to_messages_request(cc_payload) _dbg( @@ -513,8 +576,8 @@ def _handle_anthropic_backend(ctx: RouteContext, cc_payload: dict[str, Any], tur headers = apply_header_modifications(headers, ctx.header_modifications) if ctx.is_stream: - return _handle_anthropic_stream(ctx, anthropic_payload, url, headers, turn) - return _handle_anthropic_non_stream(ctx, anthropic_payload, url, headers, turn) + return _handle_anthropic_stream(ctx, anthropic_payload, url, headers, turn, request_started_at) + return _handle_anthropic_non_stream(ctx, anthropic_payload, url, headers, turn, request_started_at) def _handle_anthropic_non_stream( @@ -523,6 +586,7 @@ def _handle_anthropic_non_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, + request_started_at: float, ): """处理 Anthropic 后端的非流式 Responses 返回。""" anthropic_payload['stream'] = False @@ -544,6 +608,9 @@ def _handle_anthropic_non_stream( client_model=ctx.client_model, turn=turn, debug_label='Messages 转回 Responses 后', + ctx=ctx, + request_started_at=request_started_at, + upstream_url=url, ) @@ -553,6 +620,7 @@ def _handle_anthropic_stream( url: str, headers: dict[str, str], turn: dict[str, Any] | None, + request_started_at: float, ): """处理 Anthropic 后端的流式 Responses 返回。 @@ -608,7 +676,18 @@ def _handle_anthropic_stream( 'model': ctx.client_model, 'event_count': len(client_events), }) - finalize_turn(turn) + duration_ms = int((perf_counter() - request_started_at) * 1000) + request_history.record( + route='responses', + client_model=ctx.client_model, + actual_model=ctx.upstream_model, + backend=ctx.backend, + upstream_url=url, + usage=None, + duration_ms=duration_ms, + started_at=(turn or {}).get('started_at'), + ) + finalize_turn(turn, duration_ms=duration_ms) return sse_response(generate()) @@ -619,6 +698,9 @@ def _finalize_responses_response( client_model: str, turn: dict[str, Any], debug_label: str, + ctx: RouteContext, + request_started_at: float, + upstream_url: str, ): """统一收尾非流式 Responses 响应。 @@ -629,32 +711,26 @@ def _finalize_responses_response( _dbg(debug_label + '=' + json.dumps(response_data, ensure_ascii=False, default=str)[:1000]) log_usage('响应生成', response_data.get('usage', {}), input_key='input_tokens', output_key='output_tokens') + usage = response_data.get('usage') + duration_ms = int((perf_counter() - request_started_at) * 1000) usage_tracker.record( client_model, - response_data.get('usage'), + usage, input_key='input_tokens', output_key='output_tokens', ) + request_history.record( + route='responses', + client_model=client_model, + actual_model=ctx.upstream_model, + backend=ctx.backend, + upstream_url=upstream_url, + usage=usage, + duration_ms=duration_ms, + started_at=(turn or {}).get('started_at'), + ) attach_client_response(turn, response_data) - finalize_turn(turn, usage=response_data.get('usage')) - - output_items = response_data.get('output', []) - if isinstance(output_items, list): - for item in output_items: - if not isinstance(item, dict) or item.get('type') != 'reasoning': - continue - summary = item.get('summary', []) - if not isinstance(summary, list): - continue - reasoning_text = ''.join( - part.get('text', '') - for part in summary - if isinstance(part, dict) and part.get('type') == 'summary_text' - ) - if reasoning_text: - cc_messages = responses_to_cc(request.get_json(silent=True, force=True) or {}).get('messages', []) - thinking_cache.store_from_response(cc_messages, reasoning_text) - break + finalize_turn(turn, usage=usage, duration_ms=duration_ms) return jsonify(response_data) diff --git a/static/admin.css b/static/admin.css index 875bbcb..e52b5e9 100644 --- a/static/admin.css +++ b/static/admin.css @@ -9,7 +9,7 @@ body{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI','PingFang SC','Micr input,select,button,textarea{font-family:inherit;font-size:inherit} a{color:var(--primary);text-decoration:none} code{background:var(--input);padding:1px 5px;border-radius:4px;font-size:12px;font-family:Consolas,Monaco,monospace} -.container{max-width:960px;margin:0 auto;padding:0 20px} +.container{width:min(100%,1680px);margin:0 auto;padding:0 20px} #login{display:flex;align-items:center;justify-content:center;min-height:100vh;background:linear-gradient(145deg,#0b1120 0%,#121a2e 50%,#0b1120 100%)} .login-card{background:var(--card);border:1px solid var(--border);border-radius:16px;padding:40px;width:380px;box-shadow:0 20px 60px rgba(0,0,0,.4)} @@ -83,3 +83,11 @@ main{padding:28px 0 60px} .toast-ok{background:#065f46;color:#a7f3d0} .toast-err{background:#7f1d1d;color:#fca5a5} @keyframes slideIn{from{transform:translateX(100px);opacity:0}to{transform:none;opacity:1}} + +.request-logs-wrap{overflow:auto} +.request-logs-table{min-width:1100px} +.request-logs-table td{vertical-align:top} +.log-url{max-width:320px;word-break:break-all;color:var(--muted)} +.log-status{display:inline-flex;align-items:center;padding:2px 8px;border-radius:999px;font-size:12px;font-weight:600} +.status-ok{background:rgba(34,197,94,.15);color:var(--green)} +.status-error{background:rgba(239,68,68,.15);color:var(--red)} diff --git a/static/admin.html b/static/admin.html index 5d382ad..b4a0b85 100644 --- a/static/admin.html +++ b/static/admin.html @@ -4,7 +4,7 @@