Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions agents.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Agents

## Success Criteria for Feed Processing

The full-text RSS proxy must process each of the following feeds without errors, retrieving the full content of every item and preserving the essential fields listed below:

### Test Feeds

1. **UN News RSS** — `https://news.un.org/feed/subscribe/en/news/all/rss.xml`
- Standard RSS 2.0 feed

2. **FreshRSS Atom** — `https://github.com/FreshRSS/FreshRSS/releases.atom`
- Atom feed format
- Author is in `<author><name>...</name></author>` (mapped to `item.author` by rss-parser, not `item.creator`)
- Entry ID is in `<id>` (mapped to `item.id`, not `item.guid`)

3. **AliasVault RSS** — `https://www.aliasvault.net/rss.xml`
- RSS 2.0 with `<dc:creator>` elements (Dublin Core namespace)
- Has categories per item
- rss-parser exposes `<dc:creator>` directly as `item.creator`

### Essential Fields

For each feed item in the output:

| Field | RSS Output | JSON Feed Output |
|-------|-----------|-----------------|
| Title | `<title>` | `title` |
| Link | `<link>` | `url` |
| Date | `<pubDate>` | `date_published` |
| Full content | `<content:encoded>` | `content_html` |
| Description | `<description>` | `summary` |
| Author/Creator | `<dc:creator>` | `authors[].name` |
| Categories | `<category>` | `tags` |
| GUID/ID | `<guid>` | `id` |

### Author Handling

- RSS feeds with `<dc:creator>`: mapped via `item.creator` from rss-parser
- Atom feeds with `<author>`: mapped via `item.author` from rss-parser (fallback when `item.creator` is absent)
- Output as `<dc:creator>` in RSS and `authors` array in JSON Feed
5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"@types/cypress": "^1.1.3",
"@types/express": "^4.17.17",
"@types/jest": "^27.5.2",
"@types/node": "^22",
"@types/redis": "2.8.32",
"@types/xml2js": "^0.4.11",
"@whtsky/prettier-config": "^1.0.1",
Expand All @@ -31,15 +32,15 @@
"tailwindcss": "3.2.7",
"ts-jest": "^29.4.6",
"ts-node": "^10.9.1",
"typescript": "^4.9.5"
"typescript": "^5.0.0"
},
"dependencies": {
"@postlight/parser": "^2.2.3",
"@sentry/integrations": "^7.29.0",
"@sentry/node": "^7.29.0",
"crypto-js": "^4.1.1",
"express": "^4.18.2",
"feed": "^4.2.2",
"feedsmith": "^2.9.1",
"redis": "3.1.2",
"rss-parser": "^3.12.0"
},
Expand Down
123 changes: 83 additions & 40 deletions src/app.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import express from 'express'
import bodyParser from 'body-parser'
import Parser from 'rss-parser'
import { Feed } from 'feed'
import type { FeedOptions, Item } from 'feed/lib/typings'
import { generateRssFeed, generateJsonFeed } from 'feedsmith'
import * as Sentry from '@sentry/node'
import { RewriteFrames } from '@sentry/integrations'

Expand Down Expand Up @@ -48,60 +47,104 @@ app.use(express.static(constants.publicPath))

app.use(bodyParser.urlencoded({ extended: false }))

async function getFullTextFeed(feedUrl: string, maxItemsPerFeed: number) {
const parser = new Parser()
/**
 * One entry of a normalized feed, independent of the output format.
 * The serializers (`feedToRss` / `feedToJson`) translate these fields into
 * the format-specific element or property names.
 */
interface FeedDataItem {
  /** Entry headline. */
  title: string
  /** URL of the entry. */
  link: string
  /** Publication date; emitted as `pubDate` (RSS) or `date_published` (JSON Feed). */
  date: Date
  /** Full content; emitted as `content:encoded` (RSS) or `content_html` (JSON Feed). */
  content?: string
  /** Short description; emitted as `summary` in JSON Feed. */
  description?: string
  /** Author name; emitted as `dc:creator` (RSS) or `authors[].name` (JSON Feed). */
  creator?: string
  /** Category names; emitted as `category` elements (RSS) or `tags` (JSON Feed). */
  categories?: string[]
  /** Entry identifier; emitted as `guid` (RSS) or `id` (JSON Feed). */
  guid?: string
}

/**
 * Format-neutral representation of a feed that is handed to the
 * RSS and JSON Feed serializers.
 */
interface FeedData {
  /** Feed title. */
  title: string
  /** Optional feed description. */
  description?: string
  /** Feed link; used as `home_page_url` in JSON Feed output. */
  link: string
  /** Optional feed image URL. */
  image?: string
  /** Entries of the feed. */
  items: FeedDataItem[]
}

async function getFullTextFeed(feedUrl: string, maxItemsPerFeed: number): Promise<FeedData> {
const parser = new Parser<{}, { author?: string; id?: string }>()
try {
const feed = await parser.parseURL(feedUrl)
const feedOptions: FeedOptions = {
...feed,
title: feed.title!,
description: feed.description,
link: feedUrl,
id: feedUrl,
image: feed.image?.url,
copyright: '',
}
const outputFeed = new Feed(feedOptions)

const newItems = await Promise.all((feed.items || []).filter(item => !!item.link).slice(0, maxItemsPerFeed).map(async item => {
const newItem: Item = {
...item,
title: item.title!,
link: item.link!,
date: new Date(item.pubDate!),
}
const items = await Promise.all((feed.items || []).filter(item => !!item.link).slice(0, maxItemsPerFeed).map(async item => {
let content: string | undefined = await cache.get(item.link!)
if (!content) {
content = (await parsePageUsingMercury(item.link!)).content
await cache.set(item.link!, content)
}
newItem.content = content
return newItem
return {
...item,
title: item.title!,
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we use ...item here?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in 0d10fee — now uses ...item spread with overrides for transformed fields (date, content, description).

link: item.link!,
date: new Date(item.pubDate || item.isoDate || Date.now()),
content,
description: item.contentSnippet || item.content,
creator: item.creator || item.author,
guid: item.guid || item.id,
}
}))
for (const newItem of newItems) {
outputFeed.addItem(newItem)

return {
title: feed.title!,
description: feed.description,
link: feedUrl,
image: feed.image?.url,
items,
}
return outputFeed
} catch (e) {
if (constants.sentryDsn) {
Sentry.captureException(e)
}
const outputFeed = new Feed({
id: `${feedUrl}-failed`,
return {
title: `Failed to get fulltext rss for ${feedUrl}.`,
copyright: '',
})
const errorItem: Item = {
title: `Failed to get fulltext rss for ${feedUrl}.`,
content: `Exception: ${e}`,
link: 'https://github.com/whtsky/fulltextrssplz/issues',
date: new Date(),
link: feedUrl,
items: [{
title: `Failed to get fulltext rss for ${feedUrl}.`,
content: `Exception: ${e}`,
link: 'https://github.com/whtsky/fulltextrssplz/issues',
date: new Date(),
}],
}
outputFeed.addItem(errorItem)
return outputFeed
}
}

/**
 * Serializes normalized feed data as an RSS document via feedsmith's
 * `generateRssFeed` (called in lenient mode).
 *
 * Field mapping per item: `date` -> `pubDate`, `guid` -> `guid.value`,
 * `creator` -> `dc.creator`, `content` -> `content.encoded`,
 * `categories` -> `{ name }` objects. Remaining item fields are passed
 * through unchanged via the spread.
 *
 * @param data - Format-neutral feed to serialize.
 * @returns The string produced by `generateRssFeed`.
 */
function feedToRss(data: FeedData): string {
  return generateRssFeed({
    ...data,
    // An absent description is sent as an empty string rather than omitted.
    description: data.description || '',
    image: data.image ? { url: data.image } : undefined,
    items: data.items.map(item => {
      // Fall back to the link so every item carries an identifier,
      // consistent with the `id` fallback used for JSON Feed output.
      const guidValue = item.guid || item.link
      return {
        ...item,
        pubDate: item.date,
        guid: guidValue
          ? {
              value: guidValue,
              // RSS readers treat a guid without this attribute as a permalink,
              // so opaque identifiers (e.g. Atom `tag:` ids) must be flagged.
              isPermaLink: /^https?:\/\//i.test(guidValue),
            }
          : undefined,
        dc: item.creator ? { creator: item.creator } : undefined,
        content: item.content ? { encoded: item.content } : undefined,
        categories: item.categories?.map(name => ({ name })),
      }
    }),
  }, { lenient: true })
}

/**
 * Serializes normalized feed data as a JSON Feed string.
 *
 * The feed is translated to JSON Feed property names, passed through
 * feedsmith's `generateJsonFeed` in lenient mode, and the result is
 * stringified.
 *
 * @param data - Format-neutral feed to serialize.
 * @returns The JSON text of the generated feed.
 */
function feedToJson(data: FeedData): string {
  const jsonItems = data.items.map(entry => {
    // JSON Feed expects a list of author objects; omit it when no creator is known.
    const authors = entry.creator ? [{ name: entry.creator }] : undefined
    return {
      // Use the link as the identifier when the source item had none.
      id: entry.guid || entry.link,
      url: entry.link,
      title: entry.title,
      date_published: entry.date,
      content_html: entry.content,
      summary: entry.description,
      authors,
      tags: entry.categories,
    }
  })

  const jsonFeed = generateJsonFeed(
    {
      title: data.title,
      home_page_url: data.link,
      description: data.description,
      items: jsonItems,
    },
    { lenient: true },
  )

  return JSON.stringify(jsonFeed)
}

app.get('/feed', async (req, res) => {
const feedUrl = req.query.url

Expand Down Expand Up @@ -145,18 +188,18 @@ app.get('/feed', async (req, res) => {
}
}

const outputFeed = await getFullTextFeed(feedUrl, maxItemsPerFeed)
const feedData = await getFullTextFeed(feedUrl, maxItemsPerFeed)

if (constants.cacheControlMaxAge > 0) {
res.set('Cache-control', `public, max-age=${constants.cacheControlMaxAge}`)
}

if (format == Format.RSS) {
res.set('Content-type', 'application/rss+xml;charset=UTF-8')
res.end(outputFeed.rss2())
res.end(feedToRss(feedData))
} else if (format == Format.JSON) {
res.set('Content-type', 'application/json;charset=UTF-8')
res.end(outputFeed.json1())
res.end(feedToJson(feedData))
} else {
res.end('unknown format:' + format)
}
Expand Down
Loading