xspider/xspider/app.py

from __future__ import annotations

import logging
import sys
import time
from typing import Optional

import requests

from .redis_queue import RedisConfigQueue
from .runner import FlowRunner
from .settings import Settings
from .storage import MongoRepository
from .variables import VariableService
from .xml_parser import XMLSiteParser

logger = logging.getLogger(__name__)

class TemplateCrawlerApp:
    """Consume XML configuration locations from Redis, parse them, and run the resulting sites."""

    def __init__(self, settings: Optional[Settings] = None) -> None:
        self.settings = settings or Settings.from_env()
        self.queue = RedisConfigQueue(
            self.settings.redis_url,
            self.settings.redis_list_key,
            timeout=self.settings.redis_block_timeout,
        )
        self.mongo = MongoRepository(
            self.settings.mongo_uri,
            self.settings.mongo_database,
        )
        self.variable_service = VariableService(self.settings.variable_service_url)
        self.parser = XMLSiteParser()
        self.runner = FlowRunner(
            storage=self.mongo,
            variable_service=self.variable_service,
        )
        # Shared HTTP session used to download XML configuration documents.
        self.http = requests.Session()

    def run_forever(self) -> None:
        logger.info("Template crawler started. Waiting for XML configurations...")
        while True:
            try:
                self._iterate()
            except KeyboardInterrupt:
                logger.info("Received interrupt; shutting down.")
                break
            except Exception:  # noqa: BLE001
                # Log and back off briefly so a single failure does not kill the worker.
                logger.exception("Unexpected error during iteration.")
                time.sleep(3)

    def _iterate(self) -> None:
        xml_location = self.queue.fetch()
        if not xml_location:
            # Nothing queued within the blocking timeout; try again on the next loop.
            return
        logger.info("Fetched XML location: %s", xml_location)
        xml_payload = self._load_xml(xml_location)
        site = self.parser.parse(xml_payload)
        self.runner.run_site(site)

    def _load_xml(self, location: str) -> str:
        logger.debug("Downloading XML from %s", location)
        response = self.http.get(location, timeout=30)
        response.raise_for_status()
        return response.text


def configure_logging() -> None:
    """Route INFO-level logs to stdout with a timestamped format."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        stream=sys.stdout,
    )
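

# Illustrative entry point (a sketch, not part of the listing above): configure
# logging, then start the worker loop. The `main` name and the __main__ guard
# are assumptions added here for demonstration only.
def main() -> None:
    configure_logging()
    TemplateCrawlerApp().run_forever()


if __name__ == "__main__":
    main()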