feat(storage): introduce task execution logging
- Add a MongoLogRepository class for recording task execution logs
- Integrate logging into FlowRunner to track flow execution steps
- Record start, success, error, and aborted states for login and business flows
- Record the XML source and snapshot (overly long content is truncated automatically)
- Extend the README with the log fields and how to use them
- Improve the pagination scraping logic to return page and record statistics
- Unify the storage interface, exposing the MongoDB client and database as properties
- Add step counting and execution-time statistics
README.md
@@ -243,6 +243,27 @@ python main.py
 - `<excel_extract>` declares Excel files to download and parse; the current implementation only logs a hint, and the concrete logic can be extended in `FlowRunner._handle_excel_extract`.
 - `MongoRepository.save_records` applies the deduplication strategy according to `flow.unique_keys` and `unique_columns`.

+## Task Logs
+
+Every run of a login flow or business flow writes a document to the MongoDB `task_logs` collection, recording the execution status and key steps to make troubleshooting easier.
+
+- Base fields:
+  - `_id`: a UUID, the globally unique task identifier.
+  - `site_id` / `flow_id`: the site and flow from the template.
+  - `status`: `start`, `success`, `error`, or `aborted`.
+  - `started_at` / `ended_at`: UTC timestamps; `duration_ms` is the execution time.
+  - `steps`: the list of execution steps, summarizing action types, pagination clicks, extraction results, and so on.
+  - `metadata`: extra information (such as the number of saved records, pages processed, and whether it was a login flow).
+  - `xml_source`: where the original XML came from; `xml_snapshot`: the XML used for this run (content beyond 50k characters is truncated and flagged).
+  - `error` / `abort_reason`: detailed information, including the stack trace, recorded when a flow fails or is aborted.
+- Write strategy:
+  - when a flow starts, a document with `status=start` is inserted;
+  - on normal completion it is updated to `success` and `steps`, `summary`, and `metadata` are filled in;
+  - if an exception occurs during the run, it is marked `error` and the stack trace is written;
+  - on `KeyboardInterrupt` or another unrecoverable interruption, it is marked `aborted`.
+
+An index on `site_id + flow_id + started_at` supports fast lookups, and filtering on `status` surfaces tasks that need investigation (see the sketch after this hunk).

 ## Development and Debugging Tips

 - The default log level is `INFO`; set the `PYTHONLOGGING` environment variable or adjust `configure_logging` to change it.
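To make the suggested query patterns concrete, here is a minimal pymongo sketch; the connection URI, database name, and the `example_site` / `list_page` identifiers are placeholders, while the field names follow the schema documented above.

```python
from pymongo import ASCENDING, DESCENDING, MongoClient

# Placeholder connection details; substitute your own URI and database.
client = MongoClient("mongodb://localhost:27017")
logs = client["crawler"]["task_logs"]

# Compound index for the suggested site_id + flow_id + started_at lookups.
logs.create_index(
    [("site_id", ASCENDING), ("flow_id", ASCENDING), ("started_at", DESCENDING)]
)

# Most recent failed runs for one flow, newest first.
for doc in (
    logs.find({"site_id": "example_site", "flow_id": "list_page", "status": "error"})
    .sort("started_at", DESCENDING)
    .limit(10)
):
    print(doc["_id"], doc.get("duration_ms"), doc.get("error", {}).get("message"))
```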
@@ -10,7 +10,7 @@ import requests
 from .redis_queue import RedisConfigQueue
 from .runner import FlowRunner
 from .settings import Settings
-from .storage import MongoRepository
+from .storage import MongoLogRepository, MongoRepository
 from .variables import VariableService
 from .xml_parser import XMLSiteParser

@@ -29,10 +29,15 @@ class TemplateCrawlerApp:
             self.settings.mongo_uri,
             self.settings.mongo_database,
         )
+        self.log_repository = MongoLogRepository(
+            self.mongo.client,
+            self.settings.mongo_database,
+        )
         self.variable_service = VariableService(self.settings.redis_url)
         self.parser = XMLSiteParser()
         self.runner = FlowRunner(
             storage=self.mongo,
+            log_storage=self.log_repository,
             variable_service=self.variable_service,
         )
         self.http = requests.Session()

@@ -56,7 +61,11 @@ class TemplateCrawlerApp:
         logger.info("Fetched XML location: %s", xml_location)
         xml_payload = self._load_xml(xml_location)
         site = self.parser.parse(xml_payload)
-        self.runner.run_site(site)
+        self.runner.run_site(
+            site,
+            xml_source=xml_location,
+            xml_content=xml_payload,
+        )

     def _load_xml(self, location: str) -> str:
         logger.debug("Downloading XML from %s", location)
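A side note on the wiring above: the log repository is handed the existing `MongoClient` via the new `client` property instead of opening a second connection, so record storage and task logs share one connection pool. A minimal stand-alone sketch of the same pattern (URI and database name are placeholders, not the project's settings):

```python
from pymongo import MongoClient

MONGO_URI = "mongodb://localhost:27017"  # placeholder for Settings.mongo_uri
DATABASE = "crawler"                     # placeholder for Settings.mongo_database

client = MongoClient(MONGO_URI)      # single client, single connection pool
records_db = client[DATABASE]        # data collections (one per site/flow)
task_logs = records_db["task_logs"]  # task log collection on the same client

print(task_logs.full_name)  # crawler.task_logs
```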
@@ -2,14 +2,15 @@ from __future__ import annotations

 import logging
 import time
-from typing import Any, Dict, Optional
+from datetime import datetime
+from typing import Any, Dict, List, Optional
 from urllib.parse import urljoin

 from .actions.registry import ActionRegistry
 from .browser import BrowserFactory, BrowserSession, BrowserError
 from .extraction import Extractor
 from .models import FlowConfig, SiteConfig, SelectorMode, ExcelExtractConfig
-from .storage import MongoRepository
+from .storage import MongoLogRepository, MongoRepository
 from .variables import VariableResolver, VariableService
 from .utils.selectors import is_xpath_selector
 from .actions.base import ActionContext

@@ -21,24 +22,47 @@ class FlowRunner:
     def __init__(
         self,
         storage: MongoRepository,
+        log_storage: MongoLogRepository,
         variable_service: VariableService,
         extractor: Optional[Extractor] = None,
     ) -> None:
         self.storage = storage
+        self.log_storage = log_storage
         self.variable_service = variable_service
         self.extractor = extractor or Extractor()

-    def run_site(self, site: SiteConfig) -> None:
+    def run_site(
+        self,
+        site: SiteConfig,
+        xml_source: Optional[str] = None,
+        xml_content: Optional[str] = None,
+    ) -> None:
         factory = BrowserFactory(site)
         resolver = VariableResolver(self.variable_service)

         with factory.session() as session:
             if site.login:
                 logger.info("Executing login flow for site %s", site.site_id)
-                self._run_flow(site, site.login, session, resolver, is_login=True)
+                self._run_flow(
+                    site,
+                    site.login,
+                    session,
+                    resolver,
+                    is_login=True,
+                    xml_source=xml_source,
+                    xml_content=xml_content,
+                )
             for flow in site.flows:
                 logger.info("Executing flow %s for site %s", flow.flow_id, site.site_id)
-                self._run_flow(site, flow, session, resolver, is_login=False)
+                self._run_flow(
+                    site,
+                    flow,
+                    session,
+                    resolver,
+                    is_login=False,
+                    xml_source=xml_source,
+                    xml_content=xml_content,
+                )

     def _run_flow(
         self,
@@ -47,21 +71,54 @@ class FlowRunner:
         session: BrowserSession,
         resolver: VariableResolver,
         is_login: bool,
+        xml_source: Optional[str],
+        xml_content: Optional[str],
     ) -> None:
         entry_url = self._resolve_entry(site, flow)
         site_context = self._build_context(site, flow, entry_url)
         action_context = ActionContext(session, resolver, site_context)

+        metadata = {
+            "entry": flow.entry or flow.metadata.get("url"),
+            "data_type": flow.data_type,
+            "unique_keys": flow.unique_keys.value,
+            "unique_columns": flow.unique_columns,
+            "is_login_flow": is_login,
+        }
+        log_handle = self.log_storage.start(
+            site,
+            flow,
+            is_login=is_login,
+            xml_source=xml_source,
+            xml_snapshot=xml_content,
+            metadata=metadata,
+        )
+        steps: List[Dict[str, Any]] = []
         if entry_url:
             resolved_entry = resolver.resolve(entry_url, action_context.site_context) or entry_url
             action_context.site_context["entry_url"] = resolved_entry
             logger.debug("Flow %s navigating to %s", flow.flow_id, resolved_entry)
+            self._record_step(
+                steps,
+                event="goto",
+                url=resolved_entry,
+            )
             session.goto(resolved_entry)

+        records_saved = 0
+        pages_processed = 0
+        try:
             for action_config in flow.actions:
                 action_cls = ActionRegistry.get(action_config.type)
                 action = action_cls(action_config)
                 logger.debug("Executing action %s", action_config.type)
+                self._record_step(
+                    steps,
+                    event="action",
+                    action_type=action_config.type,
+                    selector=action_config.selector,
+                    mode=action_config.mode.value if action_config.mode else None,
+                )
                 action.execute(action_context)

             if is_login:
@@ -75,34 +132,142 @@ class FlowRunner:
                         selector_mode,
                         timeout,
                     )
-            except BrowserError:
-                raise RuntimeError("Login verification selector not found.")
+                    self._record_step(
+                        steps,
+                        event="login_check",
+                        selector=selector,
+                        mode=selector_mode.value,
+                        status="found",
+                    )
+                except BrowserError as exc:
+                    self._record_step(
+                        steps,
+                        event="login_check",
+                        selector=selector,
+                        mode=selector_mode.value,
+                        status="missing",
+                    )
+                    raise RuntimeError("Login verification selector not found.") from exc
+                actions_count = self._count_actions(steps)
+                self.log_storage.mark_success(
+                    log_handle,
+                    steps=steps,
+                    summary={"actions": actions_count},
+                )
                 return

             if not flow.extract:
                 if flow.excel_extract:
+                    self._record_step(
+                        steps,
+                        event="excel_extract",
+                        pattern=flow.excel_extract.file_pattern,
+                        directory=flow.excel_extract.directory,
+                    )
                     self._handle_excel_extract(site, flow, session)
+                    actions_count = self._count_actions(steps)
+                    self.log_storage.mark_success(
+                        log_handle,
+                        steps=steps,
+                        summary={
+                            "actions": actions_count,
+                            "excel_extract": True,
+                        },
+                    )
                 else:
                     logger.info("Flow %s has no extract step; skipping storage.", flow.flow_id)
+                    self._record_step(
+                        steps,
+                        event="no_extract",
+                    )
+                    actions_count = self._count_actions(steps)
+                    self.log_storage.mark_success(
+                        log_handle,
+                        steps=steps,
+                        summary={"actions": actions_count},
+                    )
                 return

             if flow.paginate and flow.paginate.selector:
-                self._run_paginated(site, flow, session)
+                pagination_result = self._run_paginated(
+                    site,
+                    flow,
+                    session,
+                    steps,
+                )
+                records_saved = pagination_result.get("records", 0)
+                pages_processed = pagination_result.get("pages", 0)
             else:
-                records = self.extractor.extract(session.html(), flow.extract)
-                self.storage.save_records(site, flow, records)
+                records = list(self.extractor.extract(session.html(), flow.extract))
+                self._record_step(
+                    steps,
+                    event="extract",
+                    records=len(records),
+                )
+                records_saved = self.storage.save_records(site, flow, records)

+            actions_count = self._count_actions(steps)
+            summary = {
+                "actions": actions_count,
+                "records_saved": records_saved,
+                "pages_processed": pages_processed,
+            }
+            metadata_updates = {
+                "records_saved": records_saved,
+                "pages_processed": pages_processed,
+            }
+            self.log_storage.mark_success(
+                log_handle,
+                steps=steps,
+                summary=summary,
+                metadata=metadata_updates,
+            )
+        except Exception as exc:
+            metadata_updates = {
+                "records_saved": records_saved,
+                "pages_processed": pages_processed,
+            }
+            self.log_storage.mark_error(
+                log_handle,
+                exc,
+                steps=steps,
+                metadata=metadata_updates,
+            )
+            raise
+        except BaseException as exc:  # noqa: BLE001
+            metadata_updates = {
+                "records_saved": records_saved,
+                "pages_processed": pages_processed,
+            }
+            self.log_storage.mark_aborted(
+                log_handle,
+                reason=exc.__class__.__name__,
+                steps=steps,
+                metadata=metadata_updates,
+            )
+            raise

     def _run_paginated(
         self,
         site: SiteConfig,
         flow: FlowConfig,
         session: BrowserSession,
-    ) -> None:
+        steps: List[Dict[str, Any]],
+    ) -> Dict[str, int]:
         page = 0
+        total_records = 0
         while True:
             page += 1
-            records = self.extractor.extract(session.html(), flow.extract)
-            self.storage.save_records(site, flow, records)
+            records = list(self.extractor.extract(session.html(), flow.extract))
+            saved = self.storage.save_records(site, flow, records)
+            total_records += saved
+            self._record_step(
+                steps,
+                event="extract",
+                page=page,
+                records=len(records),
+                saved=saved,
+            )

             if flow.paginate.max_pages and page >= flow.paginate.max_pages:
                 break
@@ -111,10 +276,18 @@ class FlowRunner:
                 break
             try:
                 session.click(selector, flow.paginate.mode, timeout=5)
+                self._record_step(
+                    steps,
+                    event="paginate_click",
+                    selector=selector,
+                    mode=flow.paginate.mode.value,
+                    page=page,
+                )
             except BrowserError:
                 logger.info("Pagination stopped at page %s", page)
                 break
             time.sleep(1)
+        return {"pages": page, "records": total_records}

     def _resolve_entry(self, site: SiteConfig, flow: FlowConfig) -> Optional[str]:
         if flow.entry:
@@ -168,3 +341,19 @@ class FlowRunner:
             flow.flow_id,
             config.file_pattern,
         )
+
+    def _record_step(
+        self,
+        steps: List[Dict[str, Any]],
+        event: str,
+        **payload: Any,
+    ) -> None:
+        entry: Dict[str, Any] = {
+            "timestamp": datetime.utcnow().isoformat(),
+            "event": event,
+        }
+        entry.update({k: v for k, v in payload.items() if v is not None})
+        steps.append(entry)
+
+    def _count_actions(self, steps: List[Dict[str, Any]]) -> int:
+        return sum(1 for step in steps if step.get("event") == "action")
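To make the step-recording semantics concrete, here is a small stand-alone sketch of the same pattern as `_record_step` / `_count_actions` (an illustration, not an import from the project): payload keys whose value is `None` are dropped from the step document, and only `action` events count toward the action total.

```python
from datetime import datetime
from typing import Any, Dict, List

def record_step(steps: List[Dict[str, Any]], event: str, **payload: Any) -> None:
    entry: Dict[str, Any] = {"timestamp": datetime.utcnow().isoformat(), "event": event}
    entry.update({k: v for k, v in payload.items() if v is not None})  # drop None values
    steps.append(entry)

steps: List[Dict[str, Any]] = []
record_step(steps, event="goto", url="https://example.com/list")
record_step(steps, event="action", action_type="click", selector="#next", mode=None)
record_step(steps, event="extract", page=1, records=20, saved=18)

# 'mode' was None, so the action step carries no 'mode' key at all.
assert "mode" not in steps[1]
actions = sum(1 for step in steps if step.get("event") == "action")
print(actions)  # 1
```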
@@ -3,37 +3,57 @@ from __future__ import annotations
 import hashlib
 import json
 import logging
+import traceback
+import uuid
+from dataclasses import dataclass
+from datetime import datetime
 from typing import Any, Dict, Iterable, List, Optional

 from pymongo import MongoClient, UpdateOne
+from pymongo.collection import Collection
+from pymongo.database import Database

 from .models import FlowConfig, SiteConfig, UniqueKeyMode

 logger = logging.getLogger(__name__)


+@dataclass
+class TaskLogHandle:
+    log_id: str
+    started_at: datetime
+
+
 class MongoRepository:
     def __init__(self, mongo_uri: str, database: str) -> None:
         self._client = MongoClient(mongo_uri)
         self._db = self._client[database]

+    @property
+    def client(self) -> MongoClient:
+        return self._client
+
+    @property
+    def database(self) -> Database:
+        return self._db
+
     def save_records(
         self,
         site: SiteConfig,
         flow: FlowConfig,
         records: Iterable[Dict[str, Any]],
-    ) -> None:
+    ) -> int:
         records_list = list(records)
         if not records_list:
             logger.info("Flow %s yielded no records", flow.flow_id)
-            return
+            return 0

         collection_name = f"{site.site_id}_{flow.flow_id}"
         collection = self._db[collection_name]

         if flow.unique_keys is UniqueKeyMode.none:
             collection.insert_many(records_list)
-            return
+            return len(records_list)

         operations: List[UpdateOne] = []
         for record in records_list:
@@ -49,15 +69,20 @@ class MongoRepository:
             )
         )

+        matched = 0
+        upserted = 0
         if operations:
             result = collection.bulk_write(operations, ordered=False)
+            matched = result.matched_count
+            upserted = len(result.upserted_ids or [])
             logger.info(
                 "Saved records for %s/%s: matched=%s upserted=%s",
                 site.site_id,
                 flow.flow_id,
-                result.matched_count,
-                len(result.upserted_ids or []),
+                matched,
+                upserted,
             )
+        return matched + upserted

     def _unique_key(self, flow: FlowConfig, record: Dict[str, Any]) -> str:
         if flow.unique_keys is UniqueKeyMode.custom and flow.unique_columns:
@@ -73,3 +98,137 @@ class MongoRepository:
         if hasattr(obj, "isoformat"):
             return obj.isoformat()
         return str(obj)
+
+
+class MongoLogRepository:
+    def __init__(
+        self,
+        client: MongoClient,
+        database: str,
+        collection: str = "task_logs",
+    ) -> None:
+        self._db: Database = client[database]
+        self._collection: Collection = self._db[collection]
+
+    def start(
+        self,
+        site: SiteConfig,
+        flow: FlowConfig,
+        *,
+        is_login: bool,
+        xml_source: Optional[str],
+        xml_snapshot: Optional[str],
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> TaskLogHandle:
+        log_id = str(uuid.uuid4())
+        started_at = datetime.utcnow()
+        doc: Dict[str, Any] = {
+            "_id": log_id,
+            "site_id": site.site_id,
+            "flow_id": flow.flow_id,
+            "is_login": is_login,
+            "status": "start",
+            "started_at": started_at,
+            "xml_source": xml_source,
+            "xml_snapshot": self._truncate_xml(xml_snapshot),
+            "metadata": metadata or {},
+            "steps": [],
+        }
+        self._collection.insert_one(doc)
+        return TaskLogHandle(log_id=log_id, started_at=started_at)
+
+    def append_steps(self, handle: TaskLogHandle, steps: List[Dict[str, Any]]) -> None:
+        if not steps:
+            return
+        self._collection.update_one(
+            {"_id": handle.log_id},
+            {"$push": {"steps": {"$each": steps}}},
+        )
+
+    def mark_success(
+        self,
+        handle: TaskLogHandle,
+        steps: Optional[List[Dict[str, Any]]] = None,
+        summary: Optional[Dict[str, Any]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        self._finalize(
+            handle,
+            status="success",
+            steps=steps,
+            summary=summary,
+            metadata=metadata,
+        )
+
+    def mark_error(
+        self,
+        handle: TaskLogHandle,
+        exc: Exception,
+        steps: Optional[List[Dict[str, Any]]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        error_info = {
+            "type": exc.__class__.__name__,
+            "message": str(exc),
+            "stack": traceback.format_exc(),
+        }
+        self._finalize(
+            handle,
+            status="error",
+            steps=steps,
+            metadata=metadata,
+            extra_fields={"error": error_info},
+        )
+
+    def mark_aborted(
+        self,
+        handle: TaskLogHandle,
+        reason: str,
+        steps: Optional[List[Dict[str, Any]]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        self._finalize(
+            handle,
+            status="aborted",
+            steps=steps,
+            metadata=metadata,
+            extra_fields={"abort_reason": reason},
+        )
+
+    def _finalize(
+        self,
+        handle: TaskLogHandle,
+        *,
+        status: str,
+        steps: Optional[List[Dict[str, Any]]] = None,
+        summary: Optional[Dict[str, Any]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        extra_fields: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        ended_at = datetime.utcnow()
+        duration_ms = int((ended_at - handle.started_at).total_seconds() * 1000)
+        update_fields: Dict[str, Any] = {
+            "status": status,
+            "ended_at": ended_at,
+            "duration_ms": duration_ms,
+        }
+        if steps is not None:
+            update_fields["steps"] = steps
+        if summary is not None:
+            update_fields["summary"] = summary
+        if metadata:
+            for key, value in metadata.items():
+                update_fields[f"metadata.{key}"] = value
+        if extra_fields:
+            update_fields.update(extra_fields)
+        self._collection.update_one(
+            {"_id": handle.log_id},
+            {"$set": update_fields},
+        )

+    def _truncate_xml(self, xml: Optional[str], limit: int = 50_000) -> Optional[str]:
+        if not xml:
+            return None
+        if len(xml) <= limit:
+            return xml
+        return xml[:limit] + f"\n<!-- truncated, original_length={len(xml)} -->"
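To round out the picture, here is a hedged usage sketch of the lifecycle this class implements, mirroring what FlowRunner does. The import path, URI, and database name are placeholders, and `SimpleNamespace` objects stand in for parsed `SiteConfig` / `FlowConfig` instances (only `site_id` / `flow_id` are read by `start`).

```python
from types import SimpleNamespace
from pymongo import MongoClient

# Placeholder import; adjust to wherever the storage module lives in your package.
# from yourpackage.storage import MongoLogRepository

site = SimpleNamespace(site_id="example_site")  # stand-in for SiteConfig
flow = SimpleNamespace(flow_id="list_page")     # stand-in for FlowConfig

log_repo = MongoLogRepository(MongoClient("mongodb://localhost:27017"), "crawler")

handle = log_repo.start(
    site,
    flow,
    is_login=False,
    xml_source="http://config-server/site.xml",  # hypothetical source URL
    xml_snapshot="<site>...</site>",
    metadata={"is_login_flow": False},
)
steps = [{"event": "goto", "url": "https://example.com"}]
try:
    # ... run the flow, appending to steps ...
    log_repo.mark_success(handle, steps=steps, summary={"actions": 0})
except Exception as exc:
    log_repo.mark_error(handle, exc, steps=steps)
    raise
except BaseException as exc:  # e.g. KeyboardInterrupt -> aborted
    log_repo.mark_aborted(handle, reason=exc.__class__.__name__, steps=steps)
    raise
```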