diff --git a/.gitignore b/.gitignore
index 705cdad..abdd45d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,10 +6,7 @@
 2025-06-24_COSdata_validated.xlsx
 2025-06-23_COSdata_validated.xlsx
 testing.R
-<<<<<<< HEAD
 misc
 process_replicationnetwork_data.R
 2025-10-16_COSdata_validated.xlsx
-=======
 ~$*
->>>>>>> d40aa040e23190d65994a36b3c35bf9ba9afccce
diff --git a/fred-data-validation/.gitignore b/fred-data-validation/.gitignore
new file mode 100644
index 0000000..a4376bc
--- /dev/null
+++ b/fred-data-validation/.gitignore
@@ -0,0 +1,4 @@
+fred_venv/
+fred_xml_output/
+fred_pdfs/
+**/**/__pycache__/
diff --git a/fred-data-validation/README.md b/fred-data-validation/README.md
new file mode 100644
index 0000000..7127514
--- /dev/null
+++ b/fred-data-validation/README.md
@@ -0,0 +1,330 @@
+# FRED Data Validation Pipeline
+
+Automated validation pipeline for replication studies using LLMs, the Crossref API, and GROBID for PDF processing.
+
+## Overview
+
+This project validates replication studies by:
+1. **Reference Matching**: Comparing original study references (`ref_o`) with replication references (`ref_r`) to determine whether the replication explicitly, implicitly, or unclearly addresses the original study
+2. **Abstract Extraction**: Fetching abstracts from the Crossref API, or extracting them from PDFs with GROBID when Crossref data is unavailable
+3. **Central Claim Validation**: Using LLMs to determine whether claims from original studies are central to the research, based on the title and abstract
+
+## Features
+
+- **Parallel Processing**: Multi-threaded execution for efficient processing of large datasets
+- **Multiple Abstract Sources**: Prioritizes the Crossref API, falls back to PDF extraction via GROBID
+- **LLM-Powered Validation**: Uses GPT models with structured output for consistent validation results
+- **Comprehensive Logging**: Detailed progress tracking and error reporting
+- **CSV Export**: Saves abstracts separately for reuse
+
+## Prerequisites
+
+### Required Software
+- Python 3.12.2 or higher
+- pip (Python package manager)
+
+### Required Accounts/APIs
+- OpenAI API key (for GPT models)
+- GROBID server access (default: https://kermitt2-grobid.hf.space/)
+
+## Installation
+
+### 1. Clone the Repository
+
+```bash
+git clone https://github.com/forrtproject/FReD-data
+cd FReD-data/fred-data-validation
+```
+
+### 2. Create Virtual Environment
+
+```bash
+python3.12 -m venv fred_venv
+source fred_venv/bin/activate  # On macOS/Linux
+# OR
+# fred_venv\Scripts\activate   # On Windows
+```
+
+### 3. Install Dependencies
+
+```bash
+pip install -r requirements.txt
+```
+
+**Required packages** (see `requirements.txt`):
+```txt
+pandas
+openpyxl
+crossrefapi
+grobid-client-python
+langchain-core
+langchain-openai
+pydantic
+```
+
+### 4. Set Up API Keys
+
+Set your OpenAI API key as an environment variable:
+
+```bash
+export OPENAI_API_KEY="your-api-key-here"
+```
+
+Or add it to your shell profile (`~/.zshrc`, `~/.bashrc`):
+
+```bash
+echo 'export OPENAI_API_KEY="your-api-key-here"' >> ~/.zshrc
+source ~/.zshrc
+```
+
+## Project Structure
+
+```
+fred-data-validation/
+├── main.py                              # Pipeline entry point
+├── requirements.txt                     # Python dependencies
+├── README.md                            # This file
+├── utils/                               # Row processing and XML helpers
+├── validators/                          # LLM validation chains
+├── 2025-10-22_COSdata_validated.xlsx    # Input data file
+├── fred_mini.csv                        # Input data file with titles
+├── fred_pdfs/                           # Directory with PDF files
+│   └── *.pdf                            # Original study PDFs
+├── fred_xml_output/                     # GROBID XML output
+│   └── *.grobid.tei.xml                 # Extracted XML files
+└── fred_venv/                           # Python virtual environment
+```
+
+## Input Data Format
+
+The Excel file (`2025-10-22_COSdata_validated.xlsx`) should contain:
+
+### Required Columns
+- `ref_o`: Original study reference (author, year, title)
+- `ref_r`: Replication study reference
+- `doi_o`: DOI of the original study
+- `claim_text_o`: Claim text from the original study
+- `file_o`: PDF filename (optional, for PDF processing)
+
+### Optional Columns
+All other columns from your dataset will be preserved in the output.
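+
+Before launching a long parallel run, it can be worth checking that the required columns are present. A minimal sketch, assuming the default input filename and sheet name used by `main.py`:
+
+```python
+import pandas as pd
+
+REQUIRED = ["ref_o", "ref_r", "doi_o", "claim_text_o"]
+
+df = pd.read_excel("2025-10-22_COSdata_validated.xlsx", sheet_name="Sheet 1")
+missing = [col for col in REQUIRED if col not in df.columns]
+if missing:
+    raise SystemExit(f"Missing required columns: {missing}")
+print(f"OK: {len(df)} rows, columns: {df.columns.tolist()}")
+```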
+
+## Usage
+
+### Basic Usage
+
+```bash
+python main.py
+```
+
+### Processing Configuration
+
+Edit the script to adjust processing parameters:
+
+```python
+# Number of parallel workers (adjust based on API rate limits)
+max_workers = min(10, os.cpu_count() or 1)
+
+# Model selection
+llm = ChatOpenAI(
+    model="gpt-5-mini",  # or "gpt-4o-mini", "gpt-4", etc.
+    temperature=0.0
+)
+```
+
+### Test Mode
+
+To process only a subset of records for testing, truncate the DataFrame before the parallel run:
+
+```python
+# Process only the first 10 records
+df = df.iloc[:10]
+```
+
+## Output Files
+
+The script generates three output files:
+
+### 1. Main Results Excel File
+**Filename**: `2025-10-22_COSdata_combined_validation_parallel.xlsx`
+
+**New Columns Added:**
+- `reference_match`: Classification (explicit/implicit/unclear)
+- `ref_match_confidence`: Confidence score (0.0-1.0)
+- `ref_match_evidence`: Supporting evidence from text
+- `ref_match_explanation`: Detailed reasoning
+- `abstract_source`: Where the abstract was found (crossref/pdf/none)
+- `has_abstract`: Boolean flag
+- `abstract_text`: Full abstract text
+- `is_central_claim`: Whether the claim is central to the article
+- `claim_confidence`: Confidence score (0.0-1.0)
+- `claim_match_type`: How the claim maps (exact/construct_mapping/peripheral/unclear)
+- `claim_key_evidence`: Supporting evidence from the abstract
+- `claim_concerns`: Methodological concerns
+- `claim_explanation`: Detailed reasoning
+
+### 2. Abstracts CSV
+**Filename**: `2025-10-22_abstracts_parallel.csv`
+
+**Columns:**
+- `doi_o`: DOI of the original study
+- `abstract`: Full abstract text
+- `source`: Source of the abstract (crossref/pdf)
+
+### 3. Console Output
+Real-time progress and statistics printed to the console.
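+
+For a quick look at the results, the combined Excel file can be loaded and summarized with pandas. A small sketch assuming the default output filename:
+
+```python
+import pandas as pd
+
+# Assumes the default output filename written by main.py
+results = pd.read_excel("2025-10-22_COSdata_combined_validation_parallel.xlsx")
+
+# Distribution of reference-match classifications
+print(results["reference_match"].value_counts())
+
+# For non-explicit matches, how often was the original claim judged central?
+non_explicit = results[results["reference_match"].isin(["implicit", "unclear"])]
+print(non_explicit["is_central_claim"].value_counts())
+```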
+
+## Validation Logic
+
+### Reference Match Classification
+
+1. **EXPLICIT**: `ref_r` contains author names AND/OR the publication year from `ref_o`
+   - Example: "Replication of Finucane et al.'s (2000) study"
+
+2. **IMPLICIT**: `ref_r` mentions a specific topic/construct from `ref_o` but not the author/year
+   - Example: Original topic "implicit threat-related bias" → Replication "attention bias to social threat"
+
+3. **UNCLEAR**: `ref_r` mentions neither the authors/year nor the specific topic
+   - Example: Generic titles like "Many Labs 2"
+
+### Central Claim Validation
+
+Claims are evaluated as central if they:
+- Target the main research question mentioned in the title/abstract
+- Map to core constructs described in the abstract
+- Could be tested by methods consistent with the abstract
+- Are emphasized in the title and/or abstract
+
+**Note**: Claim validation only runs for non-explicit reference matches, to save API calls.
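+
+The reference-matching chain can also be called directly, which is useful for spot-checking single pairs. A minimal sketch, run from the project root with dependencies installed and `OPENAI_API_KEY` set; the reference strings below are illustrative only:
+
+```python
+from validators.reference_validation import (
+    setup_reference_validation_chain,
+    validate_reference_match,
+)
+
+chain = setup_reference_validation_chain()
+
+# Illustrative example pair; substitute real ref_o / ref_r strings from the dataset
+match, confidence, evidence, explanation = validate_reference_match(
+    chain,
+    ref_o="Finucane, M. L., et al. (2000). The affect heuristic in judgments of risks and benefits.",
+    ref_r="Replication of Finucane et al.'s (2000) affect heuristic study.",
+)
+print(match, confidence)  # e.g. "explicit", 0.95
+```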
+
+## Performance Optimization
+
+### Parallel Processing
+- Uses `ThreadPoolExecutor` for concurrent API calls
+- Default: up to 10 worker threads (configurable)
+- Thread-safe statistics tracking with locks
+
+### Caching
+- Checks for existing GROBID XML files before reprocessing PDFs
+- Reuses abstracts from previous runs if XML files exist
+
+### API Rate Limiting
+- Configurable worker count to respect API limits
+- Per-row error handling, so a single failed request does not stop the run
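+
+If a PDF has already been processed, its abstract can be pulled from the cached TEI XML without another GROBID call. A rough sketch using the helper from `utils/xml_utils.py`; the filename here is hypothetical, in the pipeline it is derived from the `file_o` column:
+
+```python
+import os
+from utils.xml_utils import check_abstract_in_xml
+
+# Hypothetical example file in the GROBID output directory
+xml_path = os.path.join("fred_xml_output", "example_study.grobid.tei.xml")
+
+if os.path.exists(xml_path):
+    found, abstract = check_abstract_in_xml(xml_path)
+    if found:
+        print(abstract[:200])  # reuse the cached GROBID output instead of reprocessing the PDF
+else:
+    print("No cached XML - the pipeline would send the PDF to GROBID")
+```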
+
+## Troubleshooting
+
+### Common Issues
+
+#### 1. Missing OpenAI API Key
+```
+Error: OpenAI API key not set
+```
+**Solution**: Export your API key as shown in the Setup section.
+
+#### 2. GROBID Processing Failures
+```
+Error processing PDF: Connection timeout
+```
+**Solution**: Check the GROBID server status or use an alternative server.
+
+#### 3. Memory Issues with Large Datasets
+```
+MemoryError
+```
+**Solution**: Reduce `max_workers` or process the data in batches.
+
+#### 4. Invalid JSON Output from LLM
+```
+Error: Invalid json output
+```
+**Solution**: The model may be returning malformed JSON. Check model compatibility or adjust the prompt.
+
+### Debug Mode
+
+Enable verbose logging:
+
+```python
+import logging
+logging.basicConfig(level=logging.DEBUG)
+```
+
+## Advanced Configuration
+
+### Custom GROBID Server
+
+```python
+client = GrobidClient(grobid_server="http://your-grobid-server:8070")
+```
+
+### Using Different LLM Models
+
+```python
+# For cheaper processing
+llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)
+
+# For better accuracy
+llm = ChatOpenAI(model="gpt-4o", temperature=0.0)
+
+# For Anthropic Claude (requires the langchain-anthropic package)
+from langchain_anthropic import ChatAnthropic
+llm = ChatAnthropic(model="claude-3-sonnet-20240229")
+```
+
+### Batch Processing
+
+Process data in chunks:
+
+```python
+chunk_size = 100
+for i in range(0, len(df), chunk_size):
+    chunk_df = df.iloc[i:i+chunk_size]
+    # Process chunk_df
+```
+
+## Contributing
+
+### Code Style
+- Follow PEP 8
+- Use type hints (Python 3.12+ syntax)
+- Add docstrings to functions
+- Keep functions focused and testable
+
+### Testing
+
+Run with a small subset first:
+
+```python
+df = df.iloc[:10]  # Test with 10 records
+```
+
+## License
+
+This project is open source and available under the MIT License.
+
+## Citation
+
+If you use this pipeline in your research, please cite:
+
+```
+[Add citation information]
+```
+
+## Support
+
+For issues, questions, or contributions, please create an issue on GitHub.
+
+## Acknowledgments
+
+- GROBID for PDF processing
+- Crossref for the metadata API
+- OpenAI for LLM capabilities
+- LangChain for LLM orchestration
+
+## Changelog
+
+### Version 1.0.0 (2025-10-23)
+- Initial release
+- Parallel processing implementation
+- Reference matching validation
+- Central claim validation
+- Abstract extraction from Crossref and PDFs
diff --git a/fred-data-validation/main.py b/fred-data-validation/main.py
new file mode 100644
index 0000000..89be169
--- /dev/null
+++ b/fred-data-validation/main.py
@@ -0,0 +1,144 @@
+import os
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from threading import Lock
+
+import pandas as pd
+from crossref.restful import Works, Etiquette
+from grobid_client.grobid_client import GrobidClient
+
+from validators.reference_validation import setup_reference_validation_chain
+from validators.claim_validation import setup_central_claim_validation_chain
+from utils.row_processor import process_single_row
+
+if __name__ == "__main__":
+    source_directory = 'fred_pdfs/'
+    my_etiquette = Etiquette('FRED_DATA', 'v.1.0', 'https://github.com/forrtproject/FReD-data', 'ksiziva+fredData@gmail.com')
+    works = Works(etiquette=my_etiquette)
+    df = pd.read_excel('2025-10-22_COSdata_validated.xlsx', sheet_name='Sheet 1')
+
+    print(f"Total records: {len(df)}")
+    print(f"Columns: {df.columns.tolist()}")
+
+    # Setup validation chains
+    print("\nSetting up validation chains...")
+    try:
+        reference_chain = setup_reference_validation_chain()
+        print("✓ Reference validation chain setup complete")
+    except Exception as e:
+        print(f"✗ Error setting up reference chain: {e}")
+        reference_chain = None
+
+    try:
+        claim_chain = setup_central_claim_validation_chain()
+        print("✓ Central claim validation chain setup complete")
+    except Exception as e:
+        print(f"✗ Error setting up claim chain: {e}")
+        claim_chain = None
+
+    # Add new columns for results
+    df['reference_match'] = ''
+    df['ref_match_confidence'] = 0.0
+    df['ref_match_evidence'] = ''
+    df['ref_match_explanation'] = ''
+    df['abstract_source'] = ''
+    df['has_abstract'] = False
+    df['abstract_text'] = ''
+    df['is_central_claim'] = False
+    df['claim_confidence'] = 0.0
+    df['claim_match_type'] = ''
+    df['claim_key_evidence'] = ''
+    df['claim_concerns'] = ''
+    df['claim_explanation'] = ''
+
+    abstracts_data = []
+
+    if 'ref_o' not in df.columns or 'ref_r' not in df.columns:
+        print("Required columns 'ref_o' or 'ref_r' not found!")
+        print("Available columns:", df.columns.tolist())
+    else:
+        client = GrobidClient(grobid_server="https://kermitt2-grobid.hf.space/")
+        stats_lock = Lock()
+        stats = {
+            'validations_completed': 0,
+            'claim_validations': 0,
+            'with_abstract': 0,
+            'abstracts_from_pdf': 0,
+            'errors': 0
+        }
+
+        print(f"\nProcessing {len(df)} records in parallel...")
+        max_workers = min(10, os.cpu_count() or 1)
+
+        with ThreadPoolExecutor(max_workers=max_workers) as executor:
+            future_to_row = {
+                executor.submit(
+                    process_single_row,
+                    (index, row),
+                    reference_chain,
+                    claim_chain,
+                    works,
+                    client,
+                    source_directory
+                ): index
+                for index, row in df.iterrows()
+            }
+
+            completed = 0
+            for future in as_completed(future_to_row):
+                completed += 1
+                index = future_to_row[future]
+                try:
+                    result = future.result()
+                    df.at[result['index'], 'reference_match'] = result['reference_match']
+                    df.at[result['index'], 'ref_match_confidence'] = result['ref_match_confidence']
+                    df.at[result['index'], 'ref_match_evidence'] = result['ref_match_evidence']
+                    df.at[result['index'], 'ref_match_explanation'] = result['ref_match_explanation']
+                    df.at[result['index'], 'abstract_source'] = result['abstract_source']
+                    df.at[result['index'], 'has_abstract'] = result['has_abstract']
+                    df.at[result['index'], 'abstract_text'] = result['abstract_text']
+                    df.at[result['index'], 'is_central_claim'] = result['is_central_claim']
+                    df.at[result['index'], 'claim_confidence'] = result['claim_confidence']
+                    df.at[result['index'], 'claim_match_type'] = result['claim_match_type']
+                    df.at[result['index'], 'claim_key_evidence'] = result['claim_key_evidence']
+                    df.at[result['index'], 'claim_concerns'] = result['claim_concerns']
+                    df.at[result['index'], 'claim_explanation'] = result['claim_explanation']
+
+                    if result['abstract_data']:
+                        with stats_lock:
+                            abstracts_data.append(result['abstract_data'])
+
+                    with stats_lock:
+                        if result['reference_match']:
+                            stats['validations_completed'] += 1
+                        if result['is_central_claim'] or result['claim_confidence'] > 0:
+                            stats['claim_validations'] += 1
+                        if result['abstract_source'] == 'crossref':
+                            stats['with_abstract'] += 1
+                        elif result['abstract_source'] == 'pdf':
+                            stats['abstracts_from_pdf'] += 1
+                        if result['error']:
+                            stats['errors'] += 1
+                except Exception as e:
+                    print(f"Error retrieving result for row {index}: {e}")
+                    with stats_lock:
+                        stats['errors'] += 1
+
+                if completed % 10 == 0:
+                    print(f"\nProgress: {completed}/{len(df)} records completed")
+
+        output_filename = '2025-10-22_COSdata_combined_validation_parallel.xlsx'
+        try:
+            df.to_excel(output_filename, index=False)
+            print(f"\n✓ Results saved to: {output_filename}")
+        except Exception as e:
+            print(f"✗ Error saving results: {e}")
+
+        if abstracts_data:
+            abstracts_df = pd.DataFrame(abstracts_data)
+            abstracts_csv_filename = '2025-10-22_abstracts_parallel.csv'
+            try:
+                abstracts_df.to_csv(abstracts_csv_filename, index=False, encoding='utf-8')
+                print(f"✓ Abstracts saved to: {abstracts_csv_filename}")
+                print(f"  Total abstracts collected: {len(abstracts_df)}")
+            except Exception as e:
+                print(f"✗ Error saving abstracts CSV: {e}")
+
+        print("\nProcessing statistics:")
+        for k, v in stats.items():
+            print(f"  {k}: {v}")
diff --git a/fred-data-validation/requirements.txt b/fred-data-validation/requirements.txt
new file mode 100644
index 0000000..e53e34d
--- /dev/null
+++ b/fred-data-validation/requirements.txt
@@ -0,0 +1,7 @@
+pandas>=2.2.0
+openpyxl>=3.1.0
+crossrefapi>=1.7.0
+grobid-client-python>=0.0.10
+langchain-core>=1.0.0
+langchain-openai>=1.0.0
+pydantic>=2.2.0
diff --git a/fred-data-validation/utils/__init__.py b/fred-data-validation/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/fred-data-validation/utils/row_processor.py b/fred-data-validation/utils/row_processor.py
new file mode 100644
index 0000000..6636bc5
--- /dev/null
+++ b/fred-data-validation/utils/row_processor.py
@@ -0,0 +1,171 @@
+import os
+import shutil
+import tempfile
+import traceback
+
+import pandas as pd
+
+from utils.xml_utils import check_abstract_in_xml
+from validators.reference_validation import validate_reference_match
+from validators.claim_validation import validate_central_claim
+
+
+def process_single_row(
+    row_data: tuple,
+    reference_chain,
+    claim_chain,
+    works,
+    client,
+    source_directory: str
+) -> dict:
+    """
+    Process a single row - designed to be run in parallel.
+    Returns a dictionary with all results for this row.
+    """
+    index, row = row_data
+    result = {
+        'index': index,
+        'reference_match': '',
+        'ref_match_confidence': 0.0,
+        'ref_match_evidence': '',
+        'ref_match_explanation': '',
+        'abstract_source': '',
+        'has_abstract': False,
+        'abstract_text': '',
+        'is_central_claim': False,
+        'claim_confidence': 0.0,
+        'claim_match_type': '',
+        'claim_key_evidence': '',
+        'claim_concerns': '',
+        'claim_explanation': '',
+        'abstract_data': None,
+        'error': None
+    }
+
+    temp_dir = None
+
+    try:
+        print(f"[Row {index + 1}] Starting processing...")
+
+        # STEP 1: Reference matching
+        if (pd.notna(row['ref_o']) and str(row['ref_o']).strip() and
+                pd.notna(row['ref_r']) and str(row['ref_r']).strip()):
+
+            ref_o_text = str(row['ref_o']).strip()
+            ref_r_text = str(row['ref_r']).strip()
+
+            if reference_chain:
+                reference_match, confidence, evidence, explanation = validate_reference_match(
+                    reference_chain, ref_o_text, ref_r_text
+                )
+
+                result['reference_match'] = reference_match
+                result['ref_match_confidence'] = confidence
+                result['ref_match_evidence'] = evidence
+                result['ref_match_explanation'] = explanation
+
+                print(f"[Row {index + 1}] Reference: {reference_match.upper()} ({confidence:.2f})")
+
+                # STEP 2: If NOT explicit, fetch abstract and validate claim
+                if reference_match != "explicit":
+                    abstract_text = ""
+                    abstract_source = "none"
+                    crossref_result = None
+
+                    # Try Crossref
+                    try:
+                        crossref_result = works.doi(row['doi_o'])
+                        if 'abstract' in crossref_result and crossref_result['abstract'] and crossref_result['abstract'].strip():
+                            abstract_text = crossref_result['abstract'].strip()
+                            abstract_source = "crossref"
+                            print(f"[Row {index + 1}] Abstract from Crossref")
+                    except Exception as crossref_error:
+                        print(f"[Row {index + 1}] Crossref error: {crossref_error}")
+
+                    # Try PDF if no Crossref abstract
+                    if (not abstract_text and 'file_o' in row and
+                            pd.notna(row['file_o']) and str(row['file_o']).strip()):
+
+                        try:
+                            pdf_filename = str(row['file_o']).strip()
+                            xml_filename = pdf_filename.replace('.pdf', '.grobid.tei.xml')
+                            xml_path = os.path.join('fred_xml_output', xml_filename)
+
+                            if os.path.exists(xml_path):
+                                abstract_found, abstract_content = check_abstract_in_xml(xml_path)
+                                if abstract_found:
+                                    abstract_text = abstract_content
+                                    abstract_source = "pdf"
+                                    print(f"[Row {index + 1}] Abstract from existing XML")
+                            else:
+                                temp_dir = tempfile.mkdtemp()
+                                try:
+                                    source_file_path = os.path.join(source_directory, pdf_filename)
+                                    if os.path.exists(source_file_path):
+                                        temp_file_path = os.path.join(temp_dir, pdf_filename)
+                                        shutil.copy2(source_file_path, temp_file_path)
+                                        client.process("processFulltextDocument", temp_dir, "fred_xml_output")
+
+                                        if os.path.exists(xml_path):
+                                            abstract_found, abstract_content = check_abstract_in_xml(xml_path)
+                                            if abstract_found:
+                                                abstract_text = abstract_content
+                                                abstract_source = "pdf"
+                                                print(f"[Row {index + 1}] Abstract from new XML")
+                                finally:
+                                    if temp_dir and os.path.exists(temp_dir):
+                                        shutil.rmtree(temp_dir)
+                                        temp_dir = None
+                        except Exception as pdf_error:
+                            print(f"[Row {index + 1}] PDF error: {pdf_error}")
+
+                    # Update results
+                    result['abstract_source'] = abstract_source
+                    result['has_abstract'] = bool(abstract_text)
+                    result['abstract_text'] = abstract_text
+
+                    # Store abstract data for CSV
+                    if abstract_text and 'doi_o' in row and pd.notna(row['doi_o']):
+                        result['abstract_data'] = {
+                            'doi_o': str(row['doi_o']).strip(),
+                            'abstract': abstract_text,
+                            'source': abstract_source
+                        }
+
+                    # Validate central claim
+                    if (abstract_text and claim_chain and
+                            pd.notna(row['claim_text_o']) and str(row['claim_text_o']).strip()):
+
+                        claim_text = str(row['claim_text_o']).strip()
+
+                        # Get title
+                        title = "Title not available"
+                        if crossref_result and 'title' in crossref_result:
+                            if isinstance(crossref_result['title'], list):
+                                title = crossref_result['title'][0]
+                            else:
+                                title = str(crossref_result['title'])
+
+                        is_central, claim_conf, match_type, claim_evidence, concerns, claim_explanation = validate_central_claim(
+                            claim_chain, claim_text, abstract_text, title, row['doi_o']
+                        )
+
+                        result['is_central_claim'] = is_central
+                        result['claim_confidence'] = claim_conf
+                        result['claim_match_type'] = match_type
+                        result['claim_key_evidence'] = claim_evidence
+                        result['claim_concerns'] = concerns
+                        result['claim_explanation'] = claim_explanation
+
+                        status = "CENTRAL" if is_central else "NOT CENTRAL"
+                        print(f"[Row {index + 1}] Claim: {status} ({match_type}, {claim_conf:.2f})")
+                else:
+                    print(f"[Row {index + 1}] Explicit match - skipping claim validation")
+
+        print(f"[Row {index + 1}] ✓ Complete")
+
+    except Exception as e:
+        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
+        result['error'] = error_msg
+        print(f"[Row {index + 1}] ✗ {error_msg}")
+        if temp_dir and os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir)
+
+    return result
diff --git a/fred-data-validation/utils/xml_utils.py b/fred-data-validation/utils/xml_utils.py
new file mode 100644
index 0000000..27a7b29
--- /dev/null
+++ b/fred-data-validation/utils/xml_utils.py
@@ -0,0 +1,75 @@
+import xml.etree.ElementTree as ET
+
+
+def check_abstract_in_xml(xml_file_path):
+    """Check if the XML contains abstract or summary content"""
+    try:
+        tree = ET.parse(xml_file_path)
+        root = tree.getroot()
+
+        abstract_found = False
+        abstract_content = ""
+
+        namespaces = {'tei': 'http://www.tei-c.org/ns/1.0'}
+
+        # First pass: namespaced TEI elements
+        for _, ns_uri in namespaces.items():
+            xpath_patterns = [
+                f".//{{{ns_uri}}}abstract",
+                f".//{{{ns_uri}}}div[@type='abstract']",
+                f".//{{{ns_uri}}}div[@subtype='abstract']",
+                f".//{{{ns_uri}}}summary",
+                f".//{{{ns_uri}}}div[@type='summary']"
+            ]
+            for pattern in xpath_patterns:
+                elements = root.findall(pattern)
+                for element in elements:
+                    text_content = get_element_text(element)
+                    if text_content and text_content.strip():
+                        abstract_found = True
+                        abstract_content = text_content.strip()
+                        break
+                if abstract_found:
+                    break
+            if abstract_found:
+                break
+
+        # Second pass: fall back to un-namespaced elements
+        if not abstract_found:
+            xpath_patterns = [
+                ".//abstract",
+                ".//div[@type='abstract']",
+                ".//div[@subtype='abstract']",
+                ".//summary",
+                ".//div[@type='summary']"
+            ]
+            for pattern in xpath_patterns:
+                elements = root.findall(pattern)
+                for element in elements:
+                    text_content = get_element_text(element)
+                    if text_content and text_content.strip():
+                        abstract_found = True
+                        abstract_content = text_content.strip()
+                        break
+                if abstract_found:
+                    break
+
+        return abstract_found, abstract_content
+
+    except Exception as e:
+        print(f"Error parsing XML {xml_file_path}: {e}")
+        return False, ""
+
+
+def get_element_text(element):
+    """Recursively extract all text content from an element and its children"""
+    text_parts = []
+
+    if element.text:
+        text_parts.append(element.text.strip())
+
+    for child in element:
+        child_text = get_element_text(child)
+        if child_text:
+            text_parts.append(child_text)
+        if child.tail:
+            text_parts.append(child.tail.strip())
+
+    return ' '.join(filter(None, text_parts))
diff --git a/fred-data-validation/validators/__init__.py b/fred-data-validation/validators/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/fred-data-validation/validators/claim_validation.py b/fred-data-validation/validators/claim_validation.py
new file mode 100644
index 0000000..40e6c70
--- /dev/null
+++ b/fred-data-validation/validators/claim_validation.py
@@ -0,0 +1,71 @@
+from typing import Tuple
+
+from pydantic import BaseModel, Field
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
+
+
+class CentralClaimValidation(BaseModel):
+    """Structure for central claim validation results"""
+    is_central_claim: bool = Field(description="Whether the claim is central")
+    confidence: float = Field(ge=0.0, le=1.0)
+    match_type: str = Field(description="How the claim maps: 'exact', 'construct_mapping', 'peripheral', or 'unclear'")
+    key_evidence: str = Field(description="Key sentence(s) from the title/abstract that support or contradict centrality")
+    concerns: str = Field(description="Methodological or mapping concerns (if any)")
+    explanation: str = Field(description="Detailed reasoning for the decision")
+
+
+def setup_central_claim_validation_chain():
+    """Setup LangChain for central claim validation"""
+    llm = ChatOpenAI(model="gpt-5-mini", temperature=0.0)
+    llm_structured = llm.with_structured_output(CentralClaimValidation)
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", """You are an expert in research methodology.
+    Your task: Decide whether the provided claim from an original study is a CENTRAL CLAIM of that article based on the TITLE and ABSTRACT.
+
+    Definition — Central Claim:
+    - Central claims are the MAIN research questions or PRIMARY FINDINGS that the article emphasizes
+    - They are usually referenced in the TITLE and explicitly mentioned in the ABSTRACT
+    - The abstract usually already mentions results regarding these central claims
+    - Central claims can be tested with specific methods
+
+    Evaluation Guidance:
+    - Focus ONLY on TITLE and ABSTRACT
+    - Be conservative: Return true ONLY when there is clear alignment
+
+    Output: Return ONLY a valid JSON object matching the schema. No extra text."""),
+
+        ("human", """
+    ORIGINAL STUDY:
+    Title: {title}
+    DOI: {doi}
+    Abstract: {abstract}
+
+    CLAIM (from original paper): {claim}
+
+    Task: Is this claim a central claim of the article based on title and abstract?
+    Return only the JSON object.""")
+    ])
+
+    chain = prompt | llm_structured
+    return chain
+
+
+def validate_central_claim(
+        chain, claim: str, abstract: str,
+        title: str, doi: str
+) -> Tuple[bool, float, str, str, str, str]:
+    """Validate if the claim is central to the article"""
+    try:
+        raw_result = chain.invoke({"claim": claim, "abstract": abstract, "title": title, "doi": doi})
+        if isinstance(raw_result, CentralClaimValidation):
+            parsed = raw_result
+        elif isinstance(raw_result, dict):
+            parsed = CentralClaimValidation.model_validate(raw_result)
+        elif isinstance(raw_result, str):
+            parsed = CentralClaimValidation.model_validate_json(raw_result)
+        else:
+            parsed = CentralClaimValidation.model_validate(dict(raw_result))
+        return parsed.is_central_claim, parsed.confidence, parsed.match_type, parsed.key_evidence, parsed.concerns, parsed.explanation
+    except Exception as e:
+        return False, 0.0, "error", f"Error: {e}", "Processing error", ""
diff --git a/fred-data-validation/validators/reference_validation.py b/fred-data-validation/validators/reference_validation.py
new file mode 100644
index 0000000..7d40e8d
--- /dev/null
+++ b/fred-data-validation/validators/reference_validation.py
@@ -0,0 +1,64 @@
+from typing import Tuple
+
+from pydantic import BaseModel, Field
+from langchain_openai import ChatOpenAI
+from langchain_core.prompts import ChatPromptTemplate
+
+
+class ReferenceMatchValidation(BaseModel):
+    """Structure for reference matching validation results"""
+    reference_match: str = Field(description="Classification: 'explicit', 'implicit', or 'unclear'")
+    confidence: float = Field(ge=0.0, le=1.0)
+    key_evidence: str = Field(description="Specific text from ref_r that supports the classification")
+    explanation: str = Field(description="Detailed reasoning for the classification decision")
+
+
+def setup_reference_validation_chain():
+    """Setup LangChain for reference matching validation"""
+    llm = ChatOpenAI(model="gpt-5-mini", temperature=0.0)
+    llm_structured = llm.with_structured_output(ReferenceMatchValidation)
+
+    prompt = ChatPromptTemplate.from_messages([
+        ("system", """You are an expert in academic citation analysis.
+    Your task: Determine if ref_r (replication reference) clearly indicates that it addresses ref_o (original reference).
+
+    Classification rules:
+    1. EXPLICIT: The title/text of ref_r contains the author name(s) AND/OR publication year from ref_o.
+    2. IMPLICIT: The title/text of ref_r does NOT mention author/year BUT does contain the specific, unambiguous topic, effect name, or key construct from ref_o.
+    3. UNCLEAR: The title/text of ref_r does NOT mention authors/year AND does NOT mention the specific topic from ref_o.
+
+    Be conservative: only classify as explicit/implicit when there is clear textual evidence.
+
+    Output: Return ONLY a valid JSON object matching the schema. No extra text."""),
+
+        ("human", """
+    ORIGINAL REFERENCE (ref_o):
+    {ref_o}
+
+    REPLICATION REFERENCE (ref_r):
+    {ref_r}
+
+    Task: Does ref_r's text clearly indicate it addresses ref_o?
+
+    Classify as: 'explicit', 'implicit', or 'unclear'
+    Return only the JSON object.""")
+    ])
+
+    chain = prompt | llm_structured
+    return chain
+
+
+def validate_reference_match(chain, ref_o: str, ref_r: str) -> Tuple[str, float, str, str]:
+    """Validate if ref_r clearly addresses ref_o"""
+    try:
+        raw_result = chain.invoke({"ref_o": ref_o, "ref_r": ref_r})
+        if isinstance(raw_result, ReferenceMatchValidation):
+            parsed = raw_result
+        elif isinstance(raw_result, dict):
+            parsed = ReferenceMatchValidation.model_validate(raw_result)
+        elif isinstance(raw_result, str):
+            parsed = ReferenceMatchValidation.model_validate_json(raw_result)
+        else:
+            parsed = ReferenceMatchValidation.model_validate(dict(raw_result))
+        return parsed.reference_match, parsed.confidence, parsed.key_evidence, parsed.explanation
+    except Exception as e:
+        return "error", 0.0, f"Error: {e}", "Processing error"