471 lines
16 KiB
Python
471 lines
16 KiB
Python
"""对话级文件日志
|
||
|
||
将同一段多轮对话聚合到一个 JSON 文件中,而不是按单次请求散落成多个文件。
|
||
仅在详细日志模式开启时记录。
|
||
日志目录: data/conversations/YYYY-MM-DD/{conversation_id}.json
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import copy
|
||
import hashlib
|
||
import json
|
||
import logging
|
||
import os
|
||
import threading
|
||
from datetime import datetime
|
||
from typing import Any
|
||
|
||
from config import Config
|
||
from settings import DATA_DIR
|
||
import settings
|
||
from utils.http import gen_id
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
_LOG_DIR = os.path.join(DATA_DIR, 'conversations')
|
||
_LOCKS: dict[str, threading.Lock] = {}
|
||
_LOCKS_GUARD = threading.Lock()
|
||
_STREAM_KEEP_HEAD = 12
|
||
_STREAM_KEEP_TAIL = 12
|
||
|
||
|
||
def start_turn(
|
||
*,
|
||
route: str,
|
||
client_model: str,
|
||
backend: str,
|
||
stream: bool,
|
||
client_request: dict[str, Any],
|
||
request_headers: dict[str, Any] | None = None,
|
||
target_url: str = '',
|
||
upstream_model: str = '',
|
||
metadata: dict[str, Any] | None = None,
|
||
) -> dict[str, Any] | None:
|
||
"""创建一条新的对话 turn 上下文。"""
|
||
if settings.get_debug_mode() != 'verbose':
|
||
return None
|
||
|
||
now = datetime.utcnow().isoformat() + 'Z'
|
||
conversation_id = get_conversation_id(route=route, payload=client_request)
|
||
turn_id = gen_id('turn_')
|
||
return {
|
||
'conversation_id': conversation_id,
|
||
'turn_id': turn_id,
|
||
'route': route,
|
||
'client_model': client_model,
|
||
'backend': backend,
|
||
'stream': stream,
|
||
'target_url': target_url,
|
||
'upstream_model': upstream_model,
|
||
'started_at': now,
|
||
'updated_at': now,
|
||
'request_headers': sanitize_headers(request_headers or {}),
|
||
'client_request': deep_copy_jsonable(client_request),
|
||
'metadata': deep_copy_jsonable(metadata or {}),
|
||
'upstream_request': None,
|
||
'upstream_response': None,
|
||
'client_response': None,
|
||
'stream_trace': {
|
||
'upstream_events': [],
|
||
'client_events': [],
|
||
'upstream_total': 0,
|
||
'client_total': 0,
|
||
'upstream_dropped': 0,
|
||
'client_dropped': 0,
|
||
'summary': {},
|
||
},
|
||
'error': None,
|
||
}
|
||
|
||
|
||
def get_conversation_id(*, route: str, payload: dict[str, Any]) -> str:
|
||
"""尽量为同一段多轮对话生成稳定的会话 ID。"""
|
||
explicit = _pick_explicit_conversation_id(payload)
|
||
if explicit:
|
||
return _safe_id(explicit)
|
||
|
||
seed = _conversation_seed(route, payload)
|
||
digest = hashlib.sha256(seed.encode('utf-8')).hexdigest()[:24]
|
||
return f'conv_{digest}'
|
||
|
||
|
||
def attach_upstream_request(turn: dict[str, Any] | None, payload: dict[str, Any], headers: dict[str, Any] | None = None) -> None:
|
||
"""记录最终发往上游的请求。"""
|
||
if turn is None:
|
||
return
|
||
turn['upstream_request'] = {
|
||
'headers': sanitize_headers(headers or {}),
|
||
'body': deep_copy_jsonable(payload),
|
||
}
|
||
_touch(turn)
|
||
|
||
|
||
def attach_upstream_response(turn: dict[str, Any] | None, response_data: Any) -> None:
|
||
"""记录上游完整非流式响应。"""
|
||
if turn is None:
|
||
return
|
||
turn['upstream_response'] = deep_copy_jsonable(response_data)
|
||
_touch(turn)
|
||
|
||
|
||
def attach_client_response(turn: dict[str, Any] | None, response_data: Any) -> None:
|
||
"""记录最终返回给客户端的完整响应。"""
|
||
if turn is None:
|
||
return
|
||
turn['client_response'] = deep_copy_jsonable(response_data)
|
||
_touch(turn)
|
||
|
||
|
||
def append_upstream_event(turn: dict[str, Any] | None, event: Any) -> None:
|
||
"""记录一条上游流式事件,超限时截断保留头尾。"""
|
||
if turn is None:
|
||
return
|
||
_append_stream_event(turn['stream_trace'], 'upstream', deep_copy_jsonable(event))
|
||
_touch(turn)
|
||
|
||
|
||
def append_client_event(turn: dict[str, Any] | None, event: Any) -> None:
|
||
"""记录一条返回给客户端的流式事件,超限时截断保留头尾。"""
|
||
if turn is None:
|
||
return
|
||
_append_stream_event(turn['stream_trace'], 'client', deep_copy_jsonable(event))
|
||
_touch(turn)
|
||
|
||
|
||
def set_stream_summary(turn: dict[str, Any] | None, summary: dict[str, Any]) -> None:
|
||
"""记录流式摘要,例如累计文本、事件数、usage 等。"""
|
||
if turn is None:
|
||
return
|
||
turn['stream_trace']['summary'] = deep_copy_jsonable(summary)
|
||
_touch(turn)
|
||
|
||
|
||
def attach_error(turn: dict[str, Any] | None, error: Any) -> None:
|
||
"""记录错误信息。"""
|
||
if turn is None:
|
||
return
|
||
turn['error'] = deep_copy_jsonable(error)
|
||
_touch(turn)
|
||
|
||
|
||
def finalize_turn(
|
||
turn: dict[str, Any] | None,
|
||
*,
|
||
usage: dict[str, Any] | None = None,
|
||
duration_ms: int = 0,
|
||
) -> None:
|
||
"""将 turn 追加/更新到对应的会话日志文件。"""
|
||
if turn is None or settings.get_debug_mode() != 'verbose':
|
||
return
|
||
|
||
turn['updated_at'] = datetime.utcnow().isoformat() + 'Z'
|
||
turn['duration_ms'] = duration_ms
|
||
if usage is not None:
|
||
turn['usage'] = deep_copy_jsonable(usage)
|
||
|
||
stream_trace = turn.get('stream_trace', {})
|
||
summary = stream_trace.setdefault('summary', {})
|
||
summary['upstream_total'] = stream_trace.get('upstream_total', 0)
|
||
summary['client_total'] = stream_trace.get('client_total', 0)
|
||
summary['upstream_dropped'] = stream_trace.get('upstream_dropped', 0)
|
||
summary['client_dropped'] = stream_trace.get('client_dropped', 0)
|
||
if stream_trace.get('upstream_dropped', 0) or stream_trace.get('client_dropped', 0):
|
||
summary['truncated'] = True
|
||
|
||
threading.Thread(target=_write_turn, args=(deep_copy_jsonable(turn),), daemon=True).start()
|
||
|
||
|
||
def sanitize_headers(headers: dict[str, Any]) -> dict[str, Any]:
|
||
"""对敏感请求头做脱敏。"""
|
||
sanitized: dict[str, Any] = {}
|
||
for key, value in headers.items():
|
||
key_lower = str(key).lower()
|
||
if key_lower in {'authorization', 'x-api-key', 'api-key', 'x-goog-api-key'}:
|
||
sanitized[key] = _mask_secret(value)
|
||
else:
|
||
sanitized[key] = value
|
||
return sanitized
|
||
|
||
|
||
def deep_copy_jsonable(value: Any) -> Any:
|
||
"""尽量深拷贝 JSON 兼容数据。"""
|
||
try:
|
||
return copy.deepcopy(value)
|
||
except Exception:
|
||
try:
|
||
return json.loads(json.dumps(value, ensure_ascii=False, default=str))
|
||
except Exception:
|
||
return str(value)
|
||
|
||
|
||
def _write_turn(turn: dict[str, Any]) -> None:
|
||
conversation_id = turn['conversation_id']
|
||
lock = _get_lock(conversation_id)
|
||
with lock:
|
||
try:
|
||
date_str = turn['started_at'][:10]
|
||
day_dir = os.path.join(_LOG_DIR, date_str)
|
||
os.makedirs(day_dir, exist_ok=True)
|
||
filepath = os.path.join(day_dir, f'{conversation_id}.json')
|
||
|
||
if os.path.exists(filepath):
|
||
with open(filepath, 'r', encoding='utf-8') as f:
|
||
doc = json.load(f)
|
||
else:
|
||
doc = {
|
||
'conversation_id': conversation_id,
|
||
'route': turn.get('route', ''),
|
||
'created_at': turn['started_at'],
|
||
'updated_at': turn['updated_at'],
|
||
'turns': [],
|
||
}
|
||
|
||
turns = doc.setdefault('turns', [])
|
||
replaced = False
|
||
for index, existing in enumerate(turns):
|
||
if existing.get('turn_id') == turn.get('turn_id'):
|
||
turns[index] = turn
|
||
replaced = True
|
||
break
|
||
if not replaced:
|
||
turns.append(turn)
|
||
|
||
doc['updated_at'] = turn['updated_at']
|
||
doc['last_client_model'] = turn.get('client_model', '')
|
||
doc['last_backend'] = turn.get('backend', '')
|
||
doc['turn_count'] = len(turns)
|
||
|
||
with open(filepath, 'w', encoding='utf-8') as f:
|
||
json.dump(doc, f, ensure_ascii=False, indent=2, default=str)
|
||
except OSError as e:
|
||
logger.warning('写入对话日志失败: %s', e)
|
||
except json.JSONDecodeError as e:
|
||
logger.warning('解析对话日志失败: %s', e)
|
||
|
||
|
||
def _get_lock(conversation_id: str) -> threading.Lock:
|
||
with _LOCKS_GUARD:
|
||
if conversation_id not in _LOCKS:
|
||
_LOCKS[conversation_id] = threading.Lock()
|
||
return _LOCKS[conversation_id]
|
||
|
||
|
||
def _append_stream_event(stream_trace: dict[str, Any], kind: str, event: Any) -> None:
|
||
events_key = f'{kind}_events'
|
||
total_key = f'{kind}_total'
|
||
dropped_key = f'{kind}_dropped'
|
||
|
||
events = stream_trace.setdefault(events_key, [])
|
||
stream_trace[total_key] = stream_trace.get(total_key, 0) + 1
|
||
|
||
# 前 KEEP_HEAD 条完整保留;之后只保留最后 KEEP_TAIL 条,
|
||
# 中间部分通过 dropped 计数折叠,避免文件膨胀。
|
||
if len(events) < (_STREAM_KEEP_HEAD + _STREAM_KEEP_TAIL):
|
||
events.append(event)
|
||
return
|
||
|
||
head = events[:_STREAM_KEEP_HEAD]
|
||
tail = events[_STREAM_KEEP_HEAD:]
|
||
if len(tail) >= _STREAM_KEEP_TAIL:
|
||
tail.pop(0)
|
||
stream_trace[dropped_key] = stream_trace.get(dropped_key, 0) + 1
|
||
tail.append(event)
|
||
stream_trace[events_key] = head + tail
|
||
|
||
|
||
def _touch(turn: dict[str, Any] | None) -> None:
|
||
if turn is None:
|
||
return
|
||
turn['updated_at'] = datetime.utcnow().isoformat() + 'Z'
|
||
|
||
|
||
def _pick_explicit_conversation_id(payload: dict[str, Any]) -> str:
|
||
candidates = (
|
||
payload.get('conversation_id'),
|
||
payload.get('conversationId'),
|
||
payload.get('session_id'),
|
||
payload.get('sessionId'),
|
||
payload.get('chat_id'),
|
||
payload.get('chatId'),
|
||
payload.get('metadata', {}).get('conversation_id') if isinstance(payload.get('metadata'), dict) else None,
|
||
payload.get('metadata', {}).get('session_id') if isinstance(payload.get('metadata'), dict) else None,
|
||
)
|
||
for item in candidates:
|
||
if isinstance(item, str) and item.strip():
|
||
return item.strip()
|
||
return ''
|
||
|
||
|
||
def _conversation_seed(route: str, payload: dict[str, Any]) -> str:
|
||
"""生成稳定的对话种子。
|
||
|
||
关键原则:不能直接把整段历史消息都放进 seed,
|
||
否则每一轮历史增长都会导致 conversation_id 改变,最终每次请求都新建文件。
|
||
|
||
这里改为基于“对话根消息”生成种子:
|
||
- chat/messages: 第一条 user + 第一条 assistant(没有 assistant 时退化为第一条 user)
|
||
- responses: input 中的第一条 user + 第一条 assistant(没有 assistant 时退化为第一条 user)
|
||
"""
|
||
if route == 'chat':
|
||
return 'chat|' + _root_seed_from_messages(payload.get('messages', []))
|
||
|
||
if route == 'responses':
|
||
return 'responses|' + _root_seed_from_responses_input(payload)
|
||
|
||
if route == 'messages':
|
||
system = payload.get('system', '')
|
||
root = _root_seed_from_messages(payload.get('messages', []))
|
||
return 'messages|' + str(system) + '|' + root
|
||
|
||
return route + '|' + _pick_explicit_conversation_id(payload)
|
||
|
||
|
||
def _root_seed_from_messages(messages: Any) -> str:
|
||
if not isinstance(messages, list):
|
||
return ''
|
||
|
||
first_user = None
|
||
first_assistant = None
|
||
for msg in messages:
|
||
if not isinstance(msg, dict):
|
||
continue
|
||
role = msg.get('role', '')
|
||
if role in ('system', 'developer'):
|
||
continue
|
||
normalized = {
|
||
'role': role,
|
||
'content': _normalize_content(msg.get('content')),
|
||
'tool_call_id': msg.get('tool_call_id', ''),
|
||
'tool_calls': [
|
||
{
|
||
'id': tc.get('id', ''),
|
||
'name': (tc.get('function') or {}).get('name', ''),
|
||
}
|
||
for tc in msg.get('tool_calls', [])
|
||
if isinstance(tc, dict)
|
||
],
|
||
}
|
||
if role == 'user' and first_user is None:
|
||
first_user = normalized
|
||
elif role == 'assistant' and first_assistant is None:
|
||
first_assistant = normalized
|
||
if first_user is not None and first_assistant is not None:
|
||
break
|
||
|
||
seed_parts = []
|
||
if first_user is not None:
|
||
seed_parts.append(first_user)
|
||
if first_assistant is not None:
|
||
seed_parts.append(first_assistant)
|
||
return json.dumps(seed_parts, ensure_ascii=False, separators=(',', ':'))
|
||
|
||
|
||
def _root_seed_from_responses_input(payload: dict[str, Any]) -> str:
|
||
instructions = payload.get('instructions') or ''
|
||
input_data = payload.get('input', [])
|
||
|
||
if isinstance(input_data, str):
|
||
seed_input = input_data
|
||
elif isinstance(input_data, list):
|
||
seed_input = _root_seed_from_responses_items(input_data)
|
||
else:
|
||
seed_input = json.dumps(input_data, ensure_ascii=False, default=str)
|
||
|
||
return instructions + '|' + seed_input
|
||
|
||
|
||
def _root_seed_from_responses_items(items: list[Any]) -> str:
|
||
first_user = None
|
||
first_assistant = None
|
||
|
||
for item in items:
|
||
if not isinstance(item, dict):
|
||
continue
|
||
item_type = item.get('type', '')
|
||
role = item.get('role', '')
|
||
|
||
if item_type in ('message', 'input_text', 'output_text'):
|
||
normalized = {
|
||
'type': item_type,
|
||
'role': role,
|
||
'content': _normalize_content(
|
||
item.get('content')
|
||
or item.get('text')
|
||
or item.get('input_text')
|
||
or item.get('output_text')
|
||
or ''
|
||
),
|
||
}
|
||
if role == 'user' and first_user is None:
|
||
first_user = normalized
|
||
elif role == 'assistant' and first_assistant is None:
|
||
first_assistant = normalized
|
||
|
||
elif item_type == 'function_call' and first_assistant is None:
|
||
first_assistant = {
|
||
'type': 'function_call',
|
||
'name': item.get('name', ''),
|
||
'call_id': item.get('call_id', ''),
|
||
}
|
||
|
||
if first_user is not None and first_assistant is not None:
|
||
break
|
||
|
||
seed_parts = []
|
||
if first_user is not None:
|
||
seed_parts.append(first_user)
|
||
if first_assistant is not None:
|
||
seed_parts.append(first_assistant)
|
||
return json.dumps(seed_parts, ensure_ascii=False, separators=(',', ':'))
|
||
|
||
|
||
def _normalize_messages_seed(messages: Any) -> str:
|
||
if not isinstance(messages, list):
|
||
return ''
|
||
normalized: list[dict[str, Any]] = []
|
||
for msg in messages:
|
||
if not isinstance(msg, dict):
|
||
continue
|
||
normalized.append({
|
||
'role': msg.get('role', ''),
|
||
'content': _normalize_content(msg.get('content')),
|
||
'tool_call_id': msg.get('tool_call_id', ''),
|
||
'tool_calls': [
|
||
{
|
||
'id': tc.get('id', ''),
|
||
'name': (tc.get('function') or {}).get('name', ''),
|
||
}
|
||
for tc in msg.get('tool_calls', [])
|
||
if isinstance(tc, dict)
|
||
],
|
||
})
|
||
return json.dumps(normalized, ensure_ascii=False, separators=(',', ':'))
|
||
|
||
|
||
def _normalize_content(content: Any) -> Any:
|
||
if isinstance(content, str):
|
||
return content
|
||
if isinstance(content, list):
|
||
result = []
|
||
for item in content:
|
||
if isinstance(item, dict):
|
||
result.append(item)
|
||
else:
|
||
result.append(str(item))
|
||
return result
|
||
if content is None:
|
||
return ''
|
||
return str(content)
|
||
|
||
|
||
def _safe_id(raw: str) -> str:
|
||
cleaned = ''.join(ch if ch.isalnum() or ch in ('-', '_', '.') else '_' for ch in raw.strip())
|
||
return cleaned[:120] or gen_id('conv_')
|
||
|
||
|
||
def _mask_secret(value: Any) -> str:
|
||
text = str(value or '')
|
||
if len(text) <= 8:
|
||
return '***'
|
||
return text[:4] + '***' + text[-4:]
|