News/RSS connector — aggregate headlines from RSS and Atom feeds.
Uses stdlib xml.etree.ElementTree for parsing (no extra dependencies).
Config file lists feeds to follow. All HTTP calls are in module-level
functions for easy mocking in tests.
NewsRSSConnector(*, config_path: str = _DEFAULT_CONFIG_PATH)
Bases: BaseConnector
Aggregate headlines from configured RSS/Atom feeds.
Source code in src/openjarvis/connectors/news_rss.py
def __init__(self, *, config_path: str = _DEFAULT_CONFIG_PATH) -> None:
    """Set up the connector with a feed config location and a fresh status.

    Args:
        config_path: Location of the feed configuration file; stored as a
            ``Path``. Keyword-only.
    """
    # Start from a freshly constructed status object.
    self._status = SyncStatus()
    # Keep the config location as a Path rather than the raw string.
    config_location = Path(config_path)
    self._config_path = config_location
|
sync(*, since: Optional[datetime] = None, cursor: Optional[str] = None) -> Iterator[Document]
Yield Documents for recent items across all configured feeds.
Source code in src/openjarvis/connectors/news_rss.py
def sync(
    self, *, since: Optional[datetime] = None, cursor: Optional[str] = None
) -> Iterator[Document]:
    """Yield Documents for recent items across all configured feeds.

    Args:
        since: Optional cutoff. Items whose publication date parses and is
            earlier than this are skipped; items without a parseable date
            are always yielded. May be naive or timezone-aware.
        cursor: Not used by this implementation.

    Yields:
        One ``Document`` per feed item, with ``source="news_rss"`` and
        ``doc_type="article"``.

    Note:
        This is a generator, so the status fields at the end are only
        updated once the caller has consumed every item.
    """
    # Naive copy of the cutoff, computed once, used whenever at least one
    # side of the comparison carries no UTC offset.
    since_naive = since.replace(tzinfo=None) if since is not None else None
    since_is_aware = since is not None and since.utcoffset() is not None

    feeds = self._load_config()
    for feed in feeds:
        feed_name = feed.get("name", "Unknown Feed")
        feed_url = feed.get("url", "")
        if not feed_url:
            # Entries without a URL cannot be fetched.
            continue
        try:
            xml_text = _fetch_feed(feed_url)
        except httpx.HTTPError:
            # Best effort: one failing feed must not abort the others.
            continue
        items = _parse_rss_items(xml_text)
        for item in items:
            pub_dt = _parse_pub_date(item["pubDate"])
            # Filter by since only if the date is parseable.
            if since is not None and pub_dt:
                if since_is_aware and pub_dt.utcoffset() is not None:
                    # Both aware: compare the actual instants.
                    too_old = pub_dt < since
                else:
                    # Mixed or naive: comparing naive with aware raises
                    # TypeError, so compare wall-clock values instead.
                    too_old = pub_dt.replace(tzinfo=None) < since_naive
                if too_old:
                    continue
            title = item["title"] or "Untitled"
            doc_id = f"rss-{feed_name}-{title[:40]}"
            yield Document(
                doc_id=doc_id,
                source="news_rss",
                doc_type="article",
                content=item["description"],
                title=title,
                # Fall back to the current time when no date was parsed.
                timestamp=pub_dt or datetime.now(),
                url=item["link"] or None,
                metadata={"feed_name": feed_name},
            )
    self._status.state = "idle"
    self._status.last_sync = datetime.now()
|