Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions packs/content-moderation-generic/_ns.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"name": "content-moderation-generic",
"status": "experimental",
"description": "**ALPHA** — classifies user-generated content (forum posts, comments, chat messages, reviews) for moderation action. NOT for classifying what a user asks the AI to generate (see nist-genai-12-risk or safety-filter for that). This pack reads the CONTENT ITSELF and routes it to the appropriate moderation action. Seven positive intents ordered by severity: violence_threat (remove + report), hate_speech (remove + escalate), sexual_content (remove or age-gate), harassment (remove + warn), misinformation (label or remove), spam (delete + rate-limit), self_promotion (warn + disclosure). Negative class clean_content passes moderation. Pair with emotion-detection to also read the poster's emotional state for escalation decisions.",
"default_threshold": 1.5,
"default_min_voting_tokens": 2,
"compliance_frameworks": [
"EU Digital Services Act (DSA) Art. 16 notice-and-action",
"US Section 230 CDA (immunity conditioned on good-faith moderation)"
],
"lexical_groups": [
{"kind": "morph", "lang": "en", "canonical": "harass", "variants": ["harass", "harasses", "harassing", "harassment", "harassed"]},
{"kind": "morph", "lang": "en", "canonical": "threaten", "variants": ["threaten", "threatens", "threatening", "threatened", "threat", "threats"]},
{"kind": "morph", "lang": "en", "canonical": "spam", "variants": ["spam", "spamming", "spammer", "spammed"]},
{"kind": "morph", "lang": "en", "canonical": "promote", "variants": ["promote", "promotes", "promoting", "promoted", "promotion"]},
{"kind": "morph", "lang": "en", "canonical": "advertise", "variants": ["advertise", "advertises", "advertising", "advertised", "advertisement"]},
{"kind": "morph", "lang": "en", "canonical": "solicit", "variants": ["solicit", "solicits", "soliciting", "solicited", "solicitation"]},
{"kind": "morph", "lang": "en", "canonical": "fabricate", "variants": ["fabricate", "fabricates", "fabricating", "fabricated", "fabrication"]},
{"kind": "morph", "lang": "en", "canonical": "mislead", "variants": ["mislead", "misleads", "misleading", "misled"]},
{"kind": "morph", "lang": "en", "canonical": "explicit", "variants": ["explicit", "explicitly"]},
{"kind": "morph", "lang": "en", "canonical": "hate", "variants": ["hate", "hates", "hating", "hateful", "hatred"]},
{"kind": "abbrev", "lang": "en", "canonical": "nsfw", "variants": ["nsfw"]},
{"kind": "abbrev", "lang": "en", "canonical": "dm", "variants": ["dm", "dms"]},
{"kind": "abbrev", "lang": "en", "canonical": "seo", "variants": ["seo"]}
],
"policy_overrides": [
{"_comment": "Click + link — classic spam CTA pattern",
"words": ["click", "link"], "intent": "spam", "bonus": 2.0},
{"_comment": "Free + money — spam bait signal",
"words": ["free", "money"], "intent": "spam", "bonus": 2.5},
{"_comment": "Follow + bio — undisclosed self-promotion signal",
"words": ["follow", "bio"], "intent": "self_promotion", "bonus": 2.0},
{"_comment": "Death + threat — highest severity violence signal",
"words": ["death", "threat"], "intent": "violence_threat", "bonus": 3.0},
{"_comment": "Explicit + NSFW — sexual content signal",
"words": ["explicit", "nsfw"], "intent": "sexual_content", "bonus": 2.5},
{"_comment": "Election + stolen — election misinformation signal",
"words": ["election", "stolen"], "intent": "misinformation", "bonus": 2.5},
{"_comment": "Vaccine + autism — health misinformation signal",
"words": ["vaccine", "autism"], "intent": "misinformation", "bonus": 2.5},
{"_comment": "Conspiracy + proof — misinformation framing signal",
"words": ["conspiracy", "proof"], "intent": "misinformation", "bonus": 2.0}
]
}
37 changes: 37 additions & 0 deletions packs/content-moderation-generic/clean_content.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"description": "Negative class — content passes moderation. Normal community participation: asking questions, sharing experiences, leaving constructive feedback, expressing disagreement respectfully, reporting bugs, celebrating milestones, or joining discussions on-topic. The large majority of content in healthy communities routes here. Absorbs false positives from the seven positive intents to avoid over-moderation. Recommended action: no moderation action — serve content normally.",
"guardrails": [],
"instructions": "",
"persona": "",
"phrases": {
"en": [
"great point I hadn't thought of that",
"thanks for sharing this information",
"has anyone else experienced this issue",
"I disagree but here is my reasoning",
"can someone help me understand this",
"this worked for me in my case",
"looking for recommendations on this topic",
"sharing my experience with this product",
"I found a solution to this problem",
"interesting perspective I will think about it",
"does anyone know how to fix this",
"welcome to the community glad you joined",
"leaving an honest review of my experience",
"question about the terms of service",
"reporting a bug I found in the app",
"constructive feedback on the recent update",
"celebrating a milestone in this community",
"asking for advice on this situation",
"sharing a relevant article on this topic",
"I agree with the previous comment",
"thank you for the helpful response",
"off topic but related question here",
"my honest opinion after using this product"
]
},
"schema": null,
"source": null,
"target": null,
"type": "action"
}
37 changes: 37 additions & 0 deletions packs/content-moderation-generic/harassment.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"description": "Targeted personal abuse directed at a specific individual — insults, humiliation, repeated unwanted contact, coordinated pile-ons, or implicit threats designed to drive someone out of a space. Distinct from hate_speech (which targets a group by protected characteristic) by the personal targeting: the victim is a named or identifiable individual. Recommended action: remove content + issue formal warning to sender + escalate if pattern of behaviour.",
"guardrails": [],
"instructions": "",
"persona": "",
"phrases": {
"en": [
"you are absolutely worthless and pathetic",
"nobody likes you and everyone knows it",
"you should quit and never come back",
"you are the worst person in here",
"everyone here hates you just leave",
"you are a disgrace to this community",
"stop posting your opinion is trash",
"people like you ruin everything here",
"your work is embarrassingly bad honestly",
"you don't belong in this space",
"keep talking and see what happens",
"you should be ashamed of yourself",
"no one wants your opinion here",
"sending repeated abusive messages to user",
"tagging someone to coordinate pile-on",
"mocking someone's personal appearance online",
"threatening to expose personal information",
"leaving abusive comment on every post",
"I know where you live remember that",
"coordinated abuse targeting single community member",
"calling someone slurs repeatedly in replies",
"you are such a pathetic failure",
"get out of this community you're toxic"
]
},
"schema": null,
"source": null,
"target": null,
"type": "action"
}
37 changes: 37 additions & 0 deletions packs/content-moderation-generic/hate_speech.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"description": "Content targeting a protected characteristic (race, ethnicity, religion, nationality, gender, sexual orientation, disability) with dehumanising language, calls for discrimination, stereotyping that reduces a group to criminal or inferior status, or conspiracy framing that blames a group for societal problems. Distinct from harassment (targeting an individual) by the group-level targeting. Recommended action: remove immediately + escalate to trust-and-safety + consider reporting to authorities if incitement threshold is met.",
"guardrails": [],
"instructions": "",
"persona": "",
"phrases": {
"en": [
"all people from that country are criminals",
"that religion is a dangerous cult",
"people of that race are inferior",
"immigrants are replacing our culture",
"that ethnic group controls everything secretly",
"that nationality should all be deported",
"mixed-race relationships are destroying society",
"disability is just an excuse for laziness",
"women don't belong in this profession",
"that group is subhuman and dangerous",
"racial slur targeting ethnic community posted",
"dehumanizing comparison applied to religious group",
"conspiracy blaming ethnic group for problems",
"call for discrimination against protected group",
"stereotype reducing group to criminal behavior",
"content denying rights based on religion",
"incitement against religious minority community",
"gender-based slur targeting person in thread",
"content promoting racial segregation ideology",
"group inferiority claim based on ethnicity",
"LGBTQ people are a threat to children",
"ethnic group blamed for economic problems",
"sexual orientation used to justify exclusion"
]
},
"schema": null,
"source": null,
"target": null,
"type": "action"
}
37 changes: 37 additions & 0 deletions packs/content-moderation-generic/misinformation.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"description": "False factual claims presented as truth — medical misinformation, election misinformation, dangerous pseudoscience, conspiracy theories framed as revealed fact, or fabricated quotes/statistics. Distinct from opinion or satire by the factual framing ('this is proven', 'they are hiding this', 'leaked document shows'). Recommended action: apply misinformation label + link to authoritative source; remove if content poses immediate public safety risk (e.g. dangerous health advice).",
"guardrails": [],
"instructions": "",
"persona": "",
"phrases": {
"en": [
"doctors don't want you to know this",
"mainstream media hiding this from you",
"vaccine causes autism proven by study",
"5G towers spread disease it is proven",
"the real truth they suppress online",
"scientists covering up this cure",
"election was stolen here is proof",
"this food cures cancer they hide it",
"government is secretly poisoning water supply",
"moon landing was faked evidence here",
"climate change is a manufactured hoax",
"miracle supplement cures all diseases fast",
"this leaked document proves conspiracy",
"they don't want you to see this",
"banned information removed from everywhere",
"natural remedy cures disease without medicine",
"false death toll statistics exposed here",
"chemtrails are real here is evidence",
"deep state operation confirmed by insider",
"fabricated statistics about crime rates posted",
"shadow government controls everything proof inside",
"this person actually died they covered it up",
"dangerous health advice presented as medical fact"
]
},
"schema": null,
"source": null,
"target": null,
"type": "action"
}
37 changes: 37 additions & 0 deletions packs/content-moderation-generic/self_promotion.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"description": "A community member promoting their own product, service, content, or affiliate link without adequate disclosure or in violation of community rules about self-promotion. Distinct from spam (automated/mass/irrelevant commercial posts) by being a single identifiable user promoting their own work. Many communities allow self-promotion in designated spaces or with disclosure — this intent flags the behaviour for rule-check, not automatic removal. Recommended action: issue disclosure warning or redirect to designated self-promotion channel; remove if undisclosed affiliate or if repeat violation.",
"guardrails": [],
"instructions": "",
"persona": "",
"phrases": {
"en": [
"check out my new YouTube channel",
"I just launched my product buy it",
"follow me for more content like this",
"link in bio for my services",
"hire me I'm a freelancer check profile",
"my newsletter has all the answers",
"shameless plug for my new course",
"just released my book buy here",
"visit my Etsy shop link below",
"I do this professionally contact me",
"subscribe to my Substack for more",
"my app does exactly this download it",
"check my profile for coaching services",
"DM me for my consulting rates",
"I sell this product here is link",
"my agency specializes in this hire us",
"not sponsored but here is my referral",
"join my Discord for exclusive content",
"follow for tips and my product launch",
"undisclosed affiliate link in recommendation",
"I wrote a blog post about this read it",
"promoting own service without disclosure in thread",
"self-advertising without following community rules"
]
},
"schema": null,
"source": null,
"target": null,
"type": "action"
}
37 changes: 37 additions & 0 deletions packs/content-moderation-generic/sexual_content.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"description": "Explicit sexual content posted in a context where it is prohibited — public channels, non-adult communities, content involving minors, or non-consensual sharing of intimate imagery. Distinct from the csam-ncmec pack (which covers child sexual abuse material specifically). Recommended action: remove immediately + issue warning; escalate to csam-ncmec pipeline if content involves minors; for NCII (non-consensual intimate imagery), preserve evidence and provide reporter resources.",
"guardrails": [],
"instructions": "",
"persona": "",
"phrases": {
"en": [
"explicit sexual content posted in thread",
"pornographic image shared in community",
"NSFW video posted in public channel",
"sexually explicit description in comment",
"adult content in non-adult community",
"soliciting sexual contact in public chat",
"explicit sexual roleplay request posted",
"nude image shared without subject consent",
"sexual solicitation directed at user",
"graphic sexual content in public post",
"explicit sexual language in community chat",
"inappropriate sexual comment on user profile",
"sending unsolicited sexual content to user",
"pornographic link posted in general channel",
"explicit sexual material in public group",
"adult material posted to underage community",
"graphic sexual description in product review",
"explicit content bypassing age verification",
"sexual extortion attempt in direct message",
"intimate image shared without consent",
"inappropriate sexual proposition in comment",
"sexual harassment disguised as compliment",
"NSFW content submitted to general forum"
]
},
"schema": null,
"source": null,
"target": null,
"type": "action"
}
37 changes: 37 additions & 0 deletions packs/content-moderation-generic/spam.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"description": "Unsolicited commercial content, repetitive low-quality posts, bulk link drops, or off-topic promotional material. Distinct from self_promotion (which is a single user promoting their own work without disclosure) by the mass/commercial character: often automated, keyword-stuffed, or irrelevant to the thread. Recommended action: delete + rate-limit sender + consider account review.",
"guardrails": [],
"instructions": "",
"persona": "",
"phrases": {
"en": [
"visit my website for free money",
"click here for amazing limited offer",
"earn hundreds daily working from home",
"DM me for guaranteed investment returns",
"buy followers cheap instant delivery",
"get rich quick join my group",
"free gift card click to claim",
"lose weight guaranteed in seven days",
"make money online no experience needed",
"best crypto signal group join now",
"promote your business to million users",
"casino bonus claim before it expires",
"flash sale ends tonight hurry up",
"referral code for extra bonus points",
"limited time offer click now",
"I made ten thousand dollars last month",
"affordable SEO services check my profile",
"not an ad but love this product",
"drop your link below let's connect",
"this product cured my condition fast",
"passive income opportunity sign up free",
"click link in bio for discount code",
"buy real Instagram followers instant results"
]
},
"schema": null,
"source": null,
"target": null,
"type": "action"
}
37 changes: 37 additions & 0 deletions packs/content-moderation-generic/violence_threat.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"description": "Explicit threats of physical harm against an individual or group, calls for others to commit violence, graphic descriptions of planned harm, or content that combines personal identifying information with threatening language. Highest severity category — potential law enforcement referral. Recommended action: remove immediately + preserve evidence + escalate to trust-and-safety + consider referral to law enforcement if credible threat.",
"guardrails": [],
"instructions": "",
"persona": "",
"phrases": {
"en": [
"I will hurt you if you post again",
"you'll regret posting this I promise",
"death threat directed at community member",
"I know where you work watch out",
"calling for violence against individual",
"graphic description of harming someone",
"you better stop or face consequences",
"I will find you and make you pay",
"calling for assault on group members",
"threat combined with personal information posted",
"encouraging others to harm specific person",
"inciting mob violence against individual",
"bomb threat posted in public forum",
"content glorifying mass violence event",
"instructions for harming specific target",
"explicit threat against named individual",
"stay away or something bad happens",
"calling for coordinated attack on person",
"posting someone's address with threatening message",
"advocating harm against community member",
"graphic violence fantasized about specific person",
"threatening someone's family members online",
"credible threat of physical violence posted"
]
},
"schema": null,
"source": null,
"target": null,
"type": "action"
}
Loading
Loading