From bd0a27f2ef6dce6a4293b11d26d9eb4adafe2945 Mon Sep 17 00:00:00 2001 From: muddybootscode Date: Thu, 19 Feb 2026 13:47:01 -0600 Subject: [PATCH] feat: add trust graph fetcher for social trust network generation Add TrustGraphFetcher that generates directed social trust graphs with realistic topology including community structure (SBM), preferential attachment, configurable reciprocity, organic distrust edges, and adversarial bot clusters. Optimized with numpy/scipy for scalability to 100k+ nodes. - New fetcher: graphfaker/fetchers/trust.py (683 lines) - 34 unit tests covering all pipeline stages - CLI integration with full parameter exposure - Core dispatcher wired for source="trust" - numpy added as explicit dependency - Fixed cli.py logger import (was importing from venv) - Fixed error message in generate_graph() to list all sources Co-Authored-By: Claude Opus 4.6 --- .gitignore | 6 + graphfaker/__init__.py | 3 +- graphfaker/cli.py | 50 ++- graphfaker/core.py | 55 ++- graphfaker/enums.py | 1 + graphfaker/fetchers/trust.py | 683 +++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + tests/test_fetchers_trust.py | 355 ++++++++++++++++++ 8 files changed, 1149 insertions(+), 5 deletions(-) create mode 100644 graphfaker/fetchers/trust.py create mode 100644 tests/test_fetchers_trust.py diff --git a/.gitignore b/.gitignore index beead43..47d9407 100644 --- a/.gitignore +++ b/.gitignore @@ -109,3 +109,9 @@ ENV/ .idea/ cache/ + +# Internal project notes (not for distribution) +docs/internal/ + +# Local workspace / scratch +workspace/ diff --git a/graphfaker/__init__.py b/graphfaker/__init__.py index d6a382d..b938835 100644 --- a/graphfaker/__init__.py +++ b/graphfaker/__init__.py @@ -5,7 +5,8 @@ __version__ = "0.2.0" from .core import GraphFaker +from .fetchers.trust import TrustGraphFetcher from .fetchers.wiki import WikiFetcher from .logger import configure_logging, logger -__all__ = ["GraphFaker", "logger", "configure_logging", "add_file_logging"] +__all__ = ["GraphFaker", "TrustGraphFetcher", "logger", "configure_logging", "add_file_logging"] diff --git a/graphfaker/cli.py b/graphfaker/cli.py index d6966cc..be24c28 100644 --- a/graphfaker/cli.py +++ b/graphfaker/cli.py @@ -2,14 +2,16 @@ Command-line interface for GraphFaker. """ -from venv import logger +import os + import typer + from graphfaker.core import GraphFaker from graphfaker.enums import FetcherType from graphfaker.fetchers.osm import OSMGraphFetcher from graphfaker.fetchers.flights import FlightGraphFetcher +from graphfaker.logger import logger from graphfaker.utils import parse_date_range -import os app = typer.Typer() @@ -52,6 +54,32 @@ def gen( help="Year, Month and day range (YYYY-MM-DD,YYYY-MM-DD) for flight data. e.g. '2024-01-01,2024-01-15'.", ), + # for FetcherType.TRUST source + total_users: int = typer.Option( + 10000, help="Number of user nodes for trust graph." + ), + avg_trust_links: int = typer.Option( + 15, help="Average outgoing trust edges per user." + ), + reciprocity: float = typer.Option( + 0.7, help="Fraction of trust edges that are mutual (0.0 to 1.0)." + ), + num_communities: int = typer.Option( + None, help="Number of community clusters. Defaults to sqrt(total_users)." + ), + community_mixing: float = typer.Option( + 0.15, help="Fraction of edges crossing community boundaries (0.0 to 1.0)." + ), + avg_distrust_links: float = typer.Option( + 2.0, help="Average DISTRUSTS edges per user (Poisson-distributed)." + ), + bot_fraction: float = typer.Option( + 0.10, help="Fraction of nodes that are bots (0.0 to 1.0)." + ), + seed: int = typer.Option( + None, help="Random seed for reproducibility." + ), + # common export: str = typer.Option("graph.graphml", help="File path to export GraphML"), ): @@ -83,7 +111,23 @@ def gen( logger.info( f"Fetched OSM graph with {g.number_of_nodes()} nodes and {g.number_of_edges()} edges." ) - else: + elif fetcher == FetcherType.TRUST: + g = gf.generate_graph( + source="trust", + total_users=total_users, + avg_trust_links=avg_trust_links, + reciprocity=reciprocity, + num_communities=num_communities, + community_mixing=community_mixing, + avg_distrust_links=avg_distrust_links, + bot_fraction=bot_fraction, + seed=seed, + ) + logger.info( + f"Generated trust graph with {g.number_of_nodes()} nodes and {g.number_of_edges()} edges." + ) + + elif fetcher == FetcherType.FLIGHTS: # Flight fetcher parsed_date_range = parse_date_range(date_range) if date_range else None diff --git a/graphfaker/core.py b/graphfaker/core.py index 9aa69a8..6c80a9d 100644 --- a/graphfaker/core.py +++ b/graphfaker/core.py @@ -9,6 +9,7 @@ from faker import Faker from graphfaker.fetchers.osm import OSMGraphFetcher from graphfaker.fetchers.flights import FlightGraphFetcher +from graphfaker.fetchers.trust import TrustGraphFetcher from graphfaker.logger import logger fake = Faker() @@ -289,6 +290,35 @@ def _generate_faker(self, total_nodes=100, total_edges=1000): self.generate_edges(total_edges=total_edges) return self.G + def _generate_trust( + self, + total_users: int = 10000, + avg_trust_links: int = 15, + reciprocity: float = 0.7, + num_communities: Optional[int] = None, + community_mixing: float = 0.15, + avg_distrust_links: float = 2.0, + bot_fraction: float = 0.10, + seed: Optional[int] = None, + ): + """Generate a directed social trust graph via TrustGraphFetcher.""" + try: + G = TrustGraphFetcher.build_graph( + total_users=total_users, + avg_trust_links=avg_trust_links, + reciprocity=reciprocity, + num_communities=num_communities, + community_mixing=community_mixing, + avg_distrust_links=avg_distrust_links, + bot_fraction=bot_fraction, + seed=seed, + ) + self.G = G + return G + except Exception as e: + logger.error(f"Failed to generate trust graph: {e}") + raise + def generate_graph( self, source: str = "faker", @@ -305,6 +335,15 @@ def generate_graph( year: int = 2024, month: int = 1, date_range: Optional[tuple] = None, + # Trust graph parameters + total_users: int = 10000, + avg_trust_links: int = 15, + reciprocity: float = 0.7, + num_communities: Optional[int] = None, + community_mixing: float = 0.15, + avg_distrust_links: float = 2.0, + bot_fraction: float = 0.10, + seed: Optional[int] = None, ) -> nx.DiGraph: """ Unified entrypoint: choose 'random' or 'osm'. @@ -338,8 +377,22 @@ def generate_graph( month=month, date_range=date_range, ) + elif source == "trust": + return self._generate_trust( + total_users=total_users, + avg_trust_links=avg_trust_links, + reciprocity=reciprocity, + num_communities=num_communities, + community_mixing=community_mixing, + avg_distrust_links=avg_distrust_links, + bot_fraction=bot_fraction, + seed=seed, + ) else: - raise ValueError(f"Unknown source '{source}'. Use 'random' or 'osm'.") + raise ValueError( + f"Unknown source '{source}'. " + f"Use 'faker', 'osm', 'flights', or 'trust'." + ) def export_graph(self, G: nx.Graph = None, source: str = None, path: str = "graph.graphml"): """ diff --git a/graphfaker/enums.py b/graphfaker/enums.py index 4b3e845..c193697 100644 --- a/graphfaker/enums.py +++ b/graphfaker/enums.py @@ -7,3 +7,4 @@ class FetcherType(str, Enum): OSM = "osm" FLIGHTS = "flights" FAKER = "faker" + TRUST = "trust" diff --git a/graphfaker/fetchers/trust.py b/graphfaker/fetchers/trust.py new file mode 100644 index 0000000..524b7ee --- /dev/null +++ b/graphfaker/fetchers/trust.py @@ -0,0 +1,683 @@ +""" +Trust graph fetcher for generating realistic directed social trust graphs. + +Produces homogeneous User→User graphs with configurable topology: +- Power-law degree distribution (Barabási-Albert preferential attachment) +- Community structure (Stochastic Block Model) +- Configurable reciprocity (directed trust links) +- Distrust links (for compromised account simulation) +- Small-world shortcuts (Watts-Strogatz rewiring) +""" + +import math +import random +import secrets +from typing import Optional + +import networkx as nx +import numpy as np +from faker import Faker + +from graphfaker.logger import logger + +fake = Faker() + + +class TrustGraphFetcher: + """Generates realistic directed social trust graphs. + + All methods are static — the class acts as a namespace, + consistent with the existing fetcher pattern (OSMGraphFetcher, + FlightGraphFetcher). + """ + + @staticmethod + def generate_users( + G: nx.DiGraph, + total_users: int, + community_labels: dict, + ) -> None: + """Add User nodes with Faker-generated attributes. + + Args: + G: Target directed graph. + total_users: Number of user nodes to create. + community_labels: Mapping of node index -> community ID. + """ + # Direct dict update is faster than G.add_node() for existing nodes + node_data = G.nodes + for i in range(total_users): + node_id = f"user_{i}" + node_data[node_id].update( + type="User", + name=fake.name(), + public_key=secrets.token_hex(32), + created_at=str(fake.date_time_between(start_date="-3y")), + community=community_labels.get(i, 0), + is_bot=False, + is_compromised=False, + ) + + @staticmethod + def _build_community_sizes( + total_users: int, + num_communities: Optional[int], + seed: Optional[int] = None, + min_community_size: int = 50, + ) -> list: + """Compute community sizes for the SBM using a log-normal distribution. + + Draws sizes from a log-normal distribution to produce a realistic + mix of a few large communities and many smaller ones, then rounds + and clamps to guarantee each community has at least + ``min_community_size`` members. + + Args: + total_users: Total number of users. + num_communities: Number of communities. If None, defaults to ~500 users per community. + seed: Random seed for reproducibility. + min_community_size: Floor for every community (default 50). + + Returns: + List of community sizes summing to total_users. + """ + if num_communities is None: + num_communities = max(2, round(total_users / 500)) + num_communities = min(num_communities, total_users) + + # For very small graphs where log-normal doesn't make sense, + # fall back to equal partitioning + if num_communities <= 1 or total_users < num_communities * min_community_size: + base_size = total_users // num_communities + sizes = [base_size] * num_communities + sizes[-1] += total_users - sum(sizes) + return sizes + + rng = np.random.default_rng(seed) + raw = rng.lognormal(mean=0.0, sigma=1.0, size=num_communities) + raw = raw / raw.sum() # normalize to fractions + + # Scale to total_users and round + sizes = [max(min_community_size, int(round(f * total_users))) for f in raw] + + # Fix the total: redistribute difference to the largest community + diff = total_users - sum(sizes) + largest_idx = sizes.index(max(sizes)) + sizes[largest_idx] += diff + + # Safety: if adjustment pushed largest below floor, redistribute + if sizes[largest_idx] < min_community_size: + sizes[largest_idx] = min_community_size + diff = total_users - sum(sizes) + # spread across all communities proportionally + for i in range(abs(diff)): + sizes[i % num_communities] += 1 if diff > 0 else -1 + + return sizes + + @staticmethod + def _build_probability_matrix( + sizes: list, + avg_trust_links: int, + community_mixing: float, + total_users: int, + ) -> list: + """Build the SBM connection probability matrix. + + Partitions the target average degree into within-community and + between-community contributions using community_mixing as the + fraction of total edges that cross community boundaries. + + With variable community sizes, each diagonal block gets its own + within-community probability. Off-diagonal blocks use a single + uniform between-community probability to keep the matrix symmetric + (required by NetworkX's undirected SBM). + + Args: + sizes: List of community sizes (one per community). + avg_trust_links: Target average degree. + community_mixing: Fraction of total edges that are between communities (0.0 to 1.0). + total_users: Total number of users. + + Returns: + len(sizes) x len(sizes) symmetric probability matrix. + """ + num_communities = len(sizes) + within_per_node = (1.0 - community_mixing) * avg_trust_links + between_per_node = community_mixing * avg_trust_links + + # Between-community: use mean community size for a single symmetric value + mean_size = total_users / num_communities + p_between = min(1.0, between_per_node / max(1, total_users - mean_size)) + + p_matrix = [] + for i in range(num_communities): + row = [] + p_within_i = min(1.0, within_per_node / max(1, sizes[i] - 1)) + for j in range(num_communities): + row.append(p_within_i if i == j else p_between) + p_matrix.append(row) + return p_matrix + + @staticmethod + def _apply_preferential_attachment( + G: nx.Graph, + community_labels: dict, + pa_fraction: float, + rng: random.Random, + ) -> None: + """Rewire within-community edges using preferential attachment. + + For each within-community edge, with probability pa_fraction, + replace the target with a degree-proportional random node in the + same community. Creates hub nodes (power-law-ish degree distribution). + + Only within-community edges are rewired to preserve community structure. + + Args: + G: Undirected graph to modify in-place. + community_labels: Mapping of node index -> community ID. + pa_fraction: Probability of rewiring each within-community edge. + rng: Random instance for reproducibility. + """ + comm_nodes: dict = {} + for node, comm in community_labels.items(): + comm_nodes.setdefault(comm, []).append(node) + + # Cache: degree array per community, indexed by position in comm_nodes[comm] + comm_degrees: dict = {} + comm_node_idx: dict = {} + for comm, members in comm_nodes.items(): + idx_map = {node: i for i, node in enumerate(members)} + comm_node_idx[comm] = idx_map + comm_degrees[comm] = np.array( + [G.degree(w) + 1 for w in members], dtype=np.float64 + ) + + within_edges = [ + (u, v) for u, v in G.edges() + if community_labels.get(u) == community_labels.get(v) + ] + + for u, v in within_edges: + if rng.random() >= pa_fraction: + continue + + comm = community_labels[u] + members = comm_nodes[comm] + degrees = comm_degrees[comm] + + # Weighted sample using cached degree array + searchsorted + cumsum = degrees.cumsum() + total = cumsum[-1] + r = rng.random() * total + chosen_idx = int(np.searchsorted(cumsum, r)) + chosen_idx = min(chosen_idx, len(members) - 1) + new_target = members[chosen_idx] + + if new_target != u and not G.has_edge(u, new_target): + G.remove_edge(u, v) + G.add_edge(u, new_target) + + # Incrementally update degree cache + idx_map = comm_node_idx[comm] + if v in idx_map: + degrees[idx_map[v]] = max(1, degrees[idx_map[v]] - 1) + if new_target in idx_map: + degrees[idx_map[new_target]] += 1 + + @staticmethod + def _apply_reciprocity( + G_undirected: nx.Graph, + reciprocity: float, + rng: random.Random, + ) -> nx.DiGraph: + """Convert an undirected graph to directed with configurable reciprocity. + + For each undirected edge: + - With probability `reciprocity`: create both A→B and B→A (mutual trust) + - With probability `1 - reciprocity`: create only one direction (random) + + All edges get relationship="TRUSTS". + + Args: + G_undirected: Source undirected graph. + reciprocity: Fraction of edges that become mutual. + rng: Random instance for reproducibility. + + Returns: + New directed graph with TRUSTS edges. + """ + G_dir = nx.DiGraph() + G_dir.add_nodes_from(G_undirected.nodes(data=True)) + + for u, v in G_undirected.edges(): + if rng.random() < reciprocity: + G_dir.add_edge(u, v, relationship="TRUSTS") + G_dir.add_edge(v, u, relationship="TRUSTS") + else: + if rng.random() < 0.5: + G_dir.add_edge(u, v, relationship="TRUSTS") + else: + G_dir.add_edge(v, u, relationship="TRUSTS") + + return G_dir + + @staticmethod + def _add_organic_distrust( + G: nx.DiGraph, + avg_distrust_links: float, + rng: random.Random, + np_rng: np.random.Generator, + ) -> None: + """Add Poisson-distributed DISTRUSTS edges per user. + + Each user draws k ~ Poisson(avg_distrust_links) distrust targets, + producing a realistic distribution where most users distrust at + least one entity (~86.5% at lambda=2.0) but some naturally have none. + + Args: + G: Directed graph to modify in-place. + avg_distrust_links: Lambda for Poisson draw per user. + rng: Random instance for reproducibility. + np_rng: NumPy random generator for Poisson draws. + """ + if avg_distrust_links <= 0: + return + + nodes = list(G.nodes()) + n = len(nodes) + node_to_idx = {node: i for i, node in enumerate(nodes)} + draws = np_rng.poisson(lam=avg_distrust_links, size=n) + + # Pre-build successor index sets for all nodes: O(E) total + successor_idxs: list = [set() for _ in range(n)] + for u, v in G.edges(): + ui = node_to_idx.get(u) + vi = node_to_idx.get(v) + if ui is not None and vi is not None: + successor_idxs[ui].add(vi) + + for idx in range(n): + k = int(draws[idx]) + if k == 0: + continue + + excluded = successor_idxs[idx] + excluded.add(idx) # no self-loops + + n_candidates = n - len(excluded) + if n_candidates <= 0: + continue + k = min(k, n_candidates) + + # Rejection sampling: with ~15 excluded out of 10k+, + # collision rate is <0.2%, so this is nearly O(k) + selected: set = set() + while len(selected) < k: + r = np_rng.integers(0, n) + if r not in excluded and r not in selected: + selected.add(r) + + src = nodes[idx] + for si in selected: + G.add_edge(src, nodes[si], relationship="DISTRUSTS") + + @staticmethod + def _add_bot_clusters( + G: nx.DiGraph, + num_compromised: int, + bots_per_cluster: list, + rng: random.Random, + np_rng: np.random.Generator, + ) -> dict: + """Add bot cluster substructures around compromised accounts. + + Selects high-degree existing users as compromised accounts, then + creates dense bot clusters connected to them — matching the + attack pattern where compromised accounts bridge to bot-generated + account clusters. + + Args: + G: Directed graph to modify in-place. + num_compromised: Number of existing users to mark as compromised. + bots_per_cluster: List of bot counts, one per compromised account. + rng: Random instance for reproducibility. + np_rng: NumPy random generator. + + Returns: + Dict mapping compromised node IDs to list of their bot node IDs. + """ + if num_compromised <= 0 or not bots_per_cluster: + return {} + + # Select compromised accounts: above-median out-degree users + user_nodes = [n for n, d in G.nodes(data=True) if not d.get("is_bot", False)] + out_degrees = {n: G.out_degree(n) for n in user_nodes} + median_deg = sorted(out_degrees.values())[len(out_degrees) // 2] + high_degree = [n for n, d in out_degrees.items() if d >= median_deg] + num_compromised = min(num_compromised, len(high_degree)) + compromised = rng.sample(high_degree, num_compromised) + + for node in compromised: + G.nodes[node]["is_compromised"] = True + + # Build community map for concentrated distrust later + communities_map: dict = {} + for node, data in G.nodes(data=True): + c = data.get("community", -1) + if c >= 0: + communities_map.setdefault(c, []).append(node) + + cluster_map = {} + if seed_val := np_rng.integers(0, 2**31): + Faker.seed(int(seed_val)) + + for ci, comp_node in enumerate(compromised): + n_bots = bots_per_cluster[ci] if ci < len(bots_per_cluster) else bots_per_cluster[-1] + bot_ids = [] + for bi in range(n_bots): + bot_id = f"bot_{ci}_{bi}" + G.add_node( + bot_id, + type="User", + is_bot=True, + is_compromised=False, + name=fake.name(), + public_key=secrets.token_hex(32), + created_at=str(fake.date_time_between(start_date="-1y")), + community=-1, + ) + bot_ids.append(bot_id) + + # Wire compromised <-> bots + for bot_id in bot_ids: + G.add_edge(comp_node, bot_id, relationship="TRUSTS") + G.add_edge(bot_id, comp_node, relationship="TRUSTS") + + # Dense intra-cluster bot trust (~50% pairwise) + for i, b1 in enumerate(bot_ids): + for b2 in bot_ids[i + 1:]: + if rng.random() < 0.5: + G.add_edge(b1, b2, relationship="TRUSTS") + if rng.random() < 0.5: + G.add_edge(b2, b1, relationship="TRUSTS") + + # Concentrated distrust: nearby users flag the compromised account + comp_comm = G.nodes[comp_node].get("community", -1) + if comp_comm >= 0 and comp_comm in communities_map: + comm_members = [ + n for n in communities_map[comp_comm] + if n != comp_node and not G.nodes[n].get("is_compromised", False) + ] + num_distrusters = max(1, len(comm_members) // 10) + num_distrusters = min(num_distrusters, len(comm_members)) + distrusters = rng.sample(comm_members, num_distrusters) + for d_node in distrusters: + if not G.has_edge(d_node, comp_node) or \ + G.edges[d_node, comp_node].get("relationship") != "DISTRUSTS": + G.add_edge(d_node, comp_node, relationship="DISTRUSTS") + + cluster_map[comp_node] = bot_ids + + return cluster_map + + @staticmethod + def _rewire_small_world( + G: nx.DiGraph, + rewire_prob: float, + rng: random.Random, + ) -> None: + """Rewire edges for small-world shortcuts. + + For a fraction of directed edges, replace the target with a random + node to create long-range connections. This reduces average path + length while preserving clustering structure. + + Args: + G: Directed graph to modify in-place. + rewire_prob: Probability of rewiring each edge. + rng: Random instance for reproducibility. + """ + nodes = list(G.nodes()) + edges_to_rewire = [ + (u, v) for u, v, d in G.edges(data=True) + if d.get("relationship") == "TRUSTS" and rng.random() < rewire_prob + ] + + for u, v in edges_to_rewire: + new_target = rng.choice(nodes) + if new_target != u and not G.has_edge(u, new_target): + G.remove_edge(u, v) + G.add_edge(u, new_target, relationship="TRUSTS") + + @staticmethod + def _fast_sbm( + sizes: list, + p_matrix: list, + seed: Optional[int] = None, + ) -> nx.Graph: + """Generate an undirected SBM graph using vectorized numpy sampling. + + For each block (i, j) in the probability matrix: + 1. Draw edge count from Binomial(n_possible_pairs, p) + 2. Sample that many unique random pairs via np.random.choice + 3. Assemble into scipy COO sparse matrix -> NetworkX Graph + + Complexity: O(E + B^2) where E = edges, B = number of blocks. + """ + from scipy.sparse import coo_matrix + + rng = np.random.default_rng(seed) + n = sum(sizes) + num_blocks = len(sizes) + offsets = [0] + for s in sizes: + offsets.append(offsets[-1] + s) + + all_rows: list = [] + all_cols: list = [] + + for i in range(num_blocks): + for j in range(i, num_blocks): # upper triangle + diagonal + ni, nj = sizes[i], sizes[j] + p = p_matrix[i][j] + if p <= 0: + continue + + if i == j: + # Within-block: sample from upper triangle (no self-loops) + n_possible = ni * (ni - 1) // 2 + if n_possible == 0: + continue + n_edges = int(rng.binomial(n_possible, p)) + if n_edges == 0: + continue + + # Sample flat indices into upper triangle + flat = rng.choice(n_possible, size=n_edges, replace=False) + + # Convert flat upper-triangle index to (row, col) + row = ( + ni - 2 + - np.floor( + np.sqrt(-8.0 * flat + 4.0 * ni * (ni - 1) - 7.0) + / 2.0 + - 0.5 + ).astype(np.intp) + ) + col = ( + flat + + row + + 1 + - ni * (ni - 1) // 2 + + ((ni - row) * (ni - row - 1)) // 2 + ) + + row = row + offsets[i] + col = col + offsets[i] + else: + # Between-block: sample from full ni x nj grid + n_possible = ni * nj + n_edges = int(rng.binomial(n_possible, p)) + if n_edges == 0: + continue + + flat = rng.choice(n_possible, size=n_edges, replace=False) + row = flat // nj + offsets[i] + col = flat % nj + offsets[j] + + all_rows.append(row) + all_cols.append(col) + + if all_rows: + rows = np.concatenate(all_rows) + cols = np.concatenate(all_cols) + else: + rows = np.array([], dtype=np.intp) + cols = np.array([], dtype=np.intp) + + # Build symmetric adjacency (undirected) + data = np.ones(len(rows), dtype=np.int8) + adj = coo_matrix((data, (rows, cols)), shape=(n, n)) + adj = adj + adj.T # symmetrize + + G = nx.from_scipy_sparse_array(adj, create_using=nx.Graph()) + return G + + @staticmethod + def build_graph( + total_users: int = 10000, + avg_trust_links: int = 15, + reciprocity: float = 0.7, + num_communities: Optional[int] = None, + community_mixing: float = 0.15, + avg_distrust_links: float = 2.0, + bot_fraction: float = 0.10, + rewire_prob: float = 0.05, + seed: Optional[int] = None, + ) -> nx.DiGraph: + """Generate a realistic directed social trust graph. + + Pipeline: + 1. Build community structure via Stochastic Block Model + 2. Preferential attachment rewiring for hub emergence + 3. Convert to directed edges with configurable reciprocity + 4. Small-world rewiring for realistic path lengths + 5. Relabel nodes to user_N and attach Faker-generated attributes + 6. Add organic Poisson-distributed distrust links + 7. Add bot clusters around compromised accounts + + Args: + total_users: Number of user nodes. + avg_trust_links: Target average outgoing trust edges per user. + reciprocity: Fraction of edges that are mutual (0.0 to 1.0). + num_communities: Number of community clusters. None = auto (~500 users per community). + community_mixing: Fraction of edges crossing community boundaries (0.0 to 1.0). + avg_distrust_links: Average DISTRUSTS edges per user (Poisson lambda). + bot_fraction: Fraction of nodes that are bots (0.0 to 1.0). Default 0.10 (10%). + rewire_prob: Probability of rewiring each edge for small-world shortcuts. + seed: Random seed for reproducibility. + + Returns: + nx.DiGraph with User nodes and TRUSTS/DISTRUSTS edges. + """ + rng = random.Random(seed) + np_rng = np.random.default_rng(seed) + + logger.info( + f"Generating trust graph: {total_users} users, " + f"~{avg_trust_links} avg links, " + f"reciprocity={reciprocity}, " + f"communities={'auto' if num_communities is None else num_communities}" + ) + + # Step 1: Community structure via SBM + sizes = TrustGraphFetcher._build_community_sizes( + total_users, num_communities, seed=seed + ) + actual_num_communities = len(sizes) + + p_matrix = TrustGraphFetcher._build_probability_matrix( + sizes, avg_trust_links, community_mixing, total_users + ) + + G_undirected = TrustGraphFetcher._fast_sbm(sizes, p_matrix, seed=seed) + + logger.info( + f"SBM generated: {G_undirected.number_of_nodes()} nodes, " + f"{G_undirected.number_of_edges()} undirected edges, " + f"{actual_num_communities} communities" + ) + + # Build community label mapping from SBM partition + community_labels = {} + node_idx = 0 + for comm_id, size in enumerate(sizes): + for _ in range(size): + community_labels[node_idx] = comm_id + node_idx += 1 + + # Step 1b: Preferential attachment for hub emergence + TrustGraphFetcher._apply_preferential_attachment( + G_undirected, community_labels, pa_fraction=0.4, rng=rng + ) + + # Step 2: Convert to directed with reciprocity + G = TrustGraphFetcher._apply_reciprocity(G_undirected, reciprocity, rng) + del G_undirected # Free undirected graph early + + logger.info( + f"Directed graph: {G.number_of_edges()} edges " + f"(reciprocity={reciprocity})" + ) + + # Step 3: Small-world rewiring (TRUSTS only) + if rewire_prob > 0: + TrustGraphFetcher._rewire_small_world(G, rewire_prob, rng) + + # Step 4: Relabel integer nodes to user_N and add attributes + mapping = {i: f"user_{i}" for i in range(total_users)} + G = nx.relabel_nodes(G, mapping, copy=False) + + if seed is not None: + Faker.seed(seed) + TrustGraphFetcher.generate_users(G, total_users, community_labels) + + # Step 5: Add organic distrust + if avg_distrust_links > 0: + TrustGraphFetcher._add_organic_distrust(G, avg_distrust_links, rng, np_rng) + distrust_count = sum( + 1 for _, _, d in G.edges(data=True) + if d.get("relationship") == "DISTRUSTS" + ) + logger.info(f"Added {distrust_count} organic distrust links") + + # Step 6: Add bot clusters (derive structure from bot_fraction) + total_bots = round(total_users * bot_fraction) + if total_bots > 0: + # Target ~15 bots per cluster, scale compromised accounts accordingly + bpc = min(15, total_bots) + num_compromised = max(1, total_bots // bpc) + bpc = total_bots // num_compromised + remainder = total_bots - (num_compromised * bpc) + # Distribute bots across clusters, spreading remainder + bots_per_cluster = [bpc] * num_compromised + for i in range(remainder): + bots_per_cluster[i] += 1 + + cluster_map = TrustGraphFetcher._add_bot_clusters( + G, num_compromised, bots_per_cluster, rng, np_rng + ) + actual_bots = sum(len(v) for v in cluster_map.values()) + logger.info( + f"Added {len(cluster_map)} bot clusters " + f"({actual_bots} bot nodes total, bot_fraction={bot_fraction})" + ) + + logger.info( + f"Trust graph complete: {G.number_of_nodes()} nodes, " + f"{G.number_of_edges()} edges" + ) + + return G diff --git a/pyproject.toml b/pyproject.toml index fb93a8e..ad88a39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,6 +21,7 @@ keywords = ["faker", "graph-data", "flights", "osmnx", "graphs", "graphfaker"] dependencies = [ "faker>=37.1.0", "networkx>=3.4.2", + "numpy>=1.24.0", "osmnx==2.0.2", "pandas>=2.2.2", "requests>=2.32.3", diff --git a/tests/test_fetchers_trust.py b/tests/test_fetchers_trust.py new file mode 100644 index 0000000..a8e604d --- /dev/null +++ b/tests/test_fetchers_trust.py @@ -0,0 +1,355 @@ +# tests/test_fetchers_trust.py +import random + +import pytest +import networkx as nx +import numpy as np + +from graphfaker.fetchers.trust import TrustGraphFetcher + + +class TestBuildCommunities: + def test_default_communities(self): + sizes = TrustGraphFetcher._build_community_sizes(100, None, seed=42) + assert sum(sizes) == 100 + assert len(sizes) == 2 # 100 / 500 rounds to 0, clamped to min 2 + + def test_explicit_communities(self): + sizes = TrustGraphFetcher._build_community_sizes(10000, 5, seed=42) + assert sum(sizes) == 10000 + assert len(sizes) == 5 + assert all(s >= 50 for s in sizes) + + def test_more_communities_than_users(self): + sizes = TrustGraphFetcher._build_community_sizes(3, 10, seed=42) + assert sum(sizes) == 3 + assert len(sizes) == 3 # clamped to total_users + + def test_single_community(self): + sizes = TrustGraphFetcher._build_community_sizes(50, 1, seed=42) + assert sizes == [50] + + def test_variable_community_sizes(self): + """Verify log-normal produces non-uniform sizes with min floor.""" + sizes = TrustGraphFetcher._build_community_sizes(10000, 20, seed=42) + assert sum(sizes) == 10000 + assert len(sizes) == 20 + assert all(s >= 50 for s in sizes) + # Sizes should NOT all be equal (log-normal is skewed) + assert len(set(sizes)) > 1 + + def test_reproducible_with_seed(self): + s1 = TrustGraphFetcher._build_community_sizes(10000, 20, seed=99) + s2 = TrustGraphFetcher._build_community_sizes(10000, 20, seed=99) + assert s1 == s2 + + +class TestProbabilityMatrix: + def test_matrix_shape(self): + sizes = [250, 250, 250, 250] + matrix = TrustGraphFetcher._build_probability_matrix( + sizes=sizes, avg_trust_links=15, + community_mixing=0.15, total_users=1000, + ) + assert len(matrix) == 4 + assert all(len(row) == 4 for row in matrix) + + def test_within_greater_than_between(self): + sizes = [250, 250, 250, 250] + matrix = TrustGraphFetcher._build_probability_matrix( + sizes=sizes, avg_trust_links=15, + community_mixing=0.15, total_users=1000, + ) + p_within = matrix[0][0] + p_between = matrix[0][1] + assert p_within > p_between + + def test_within_greater_than_between_variable_sizes(self): + """Even with the largest community, p_within > p_between.""" + sizes = [2000, 800, 600, 400, 200] + matrix = TrustGraphFetcher._build_probability_matrix( + sizes=sizes, avg_trust_links=15, + community_mixing=0.15, total_users=4000, + ) + # Check for every row (community) + for i in range(len(sizes)): + p_within = matrix[i][i] + p_between = matrix[i][(i + 1) % len(sizes)] + assert p_within > p_between, f"Community {i} (size={sizes[i]}): p_within={p_within} <= p_between={p_between}" + + def test_probabilities_capped_at_one(self): + sizes = [5, 5] + matrix = TrustGraphFetcher._build_probability_matrix( + sizes=sizes, avg_trust_links=500, + community_mixing=0.5, total_users=10, + ) + for row in matrix: + for p in row: + assert 0.0 <= p <= 1.0 + + +class TestApplyReciprocity: + @pytest.fixture + def simple_undirected(self): + G = nx.Graph() + G.add_edges_from([(0, 1), (1, 2), (2, 3), (3, 4)]) + return G + + def test_full_reciprocity(self, simple_undirected): + rng = random.Random(42) + G_dir = TrustGraphFetcher._apply_reciprocity(simple_undirected, 1.0, rng) + assert isinstance(G_dir, nx.DiGraph) + # With reciprocity=1.0, every undirected edge becomes two directed edges + assert G_dir.number_of_edges() == 8 # 4 undirected * 2 + + def test_zero_reciprocity(self, simple_undirected): + rng = random.Random(42) + G_dir = TrustGraphFetcher._apply_reciprocity(simple_undirected, 0.0, rng) + # With reciprocity=0.0, every undirected edge becomes one directed edge + assert G_dir.number_of_edges() == 4 + + def test_edge_relationship_attribute(self, simple_undirected): + rng = random.Random(42) + G_dir = TrustGraphFetcher._apply_reciprocity(simple_undirected, 0.5, rng) + for u, v, data in G_dir.edges(data=True): + assert data["relationship"] == "TRUSTS" + + +class TestOrganicDistrust: + def test_organic_distrust_distribution(self): + """Most nodes should have >=1 DISTRUSTS edge at avg=2.0.""" + rng = random.Random(42) + np_rng = np.random.default_rng(42) + G = nx.DiGraph() + n = 1000 + for i in range(n): + G.add_node(f"user_{i}") + # Add some trust edges so the graph is non-trivial + for i in range(n - 1): + G.add_edge(f"user_{i}", f"user_{i+1}", relationship="TRUSTS") + + TrustGraphFetcher._add_organic_distrust(G, 2.0, rng, np_rng) + + distrust_edges = [ + (u, v) for u, v, d in G.edges(data=True) + if d.get("relationship") == "DISTRUSTS" + ] + distrust_sources = set(u for u, _ in distrust_edges) + + # At lambda=2.0, ~86.5% should have >=1 distrust link + assert len(distrust_sources) / n > 0.75, ( + f"Only {len(distrust_sources)}/{n} users have distrust links" + ) + # Total should be roughly n * avg (within 50% tolerance) + assert n * 1.0 < len(distrust_edges) < n * 3.5, ( + f"Expected ~{n * 2} distrust edges, got {len(distrust_edges)}" + ) + + def test_zero_distrust(self): + """avg_distrust_links=0.0 should produce no DISTRUSTS edges.""" + rng = random.Random(42) + np_rng = np.random.default_rng(42) + G = nx.DiGraph() + for i in range(10): + G.add_node(f"user_{i}") + for i in range(9): + G.add_edge(f"user_{i}", f"user_{i+1}", relationship="TRUSTS") + + TrustGraphFetcher._add_organic_distrust(G, 0.0, rng, np_rng) + + distrust_edges = [ + (u, v) for u, v, d in G.edges(data=True) + if d.get("relationship") == "DISTRUSTS" + ] + assert len(distrust_edges) == 0 + + def test_distrust_no_overlap_with_trust(self): + """DISTRUSTS targets must not have TRUSTS from the same source.""" + rng = random.Random(42) + np_rng = np.random.default_rng(42) + G = nx.DiGraph() + n = 200 + for i in range(n): + G.add_node(f"user_{i}") + for i in range(n - 1): + G.add_edge(f"user_{i}", f"user_{i+1}", relationship="TRUSTS") + + TrustGraphFetcher._add_organic_distrust(G, 2.0, rng, np_rng) + + trust_set = set( + (u, v) for u, v, d in G.edges(data=True) + if d.get("relationship") == "TRUSTS" + ) + distrust_set = set( + (u, v) for u, v, d in G.edges(data=True) + if d.get("relationship") == "DISTRUSTS" + ) + overlap = trust_set & distrust_set + assert len(overlap) == 0, f"Found {len(overlap)} overlapping TRUSTS/DISTRUSTS edges" + + +class TestBotClusters: + @pytest.fixture + def graph_with_bots(self): + """Build a small graph and add bot clusters.""" + G = TrustGraphFetcher.build_graph( + total_users=200, + avg_trust_links=5, + reciprocity=0.7, + num_communities=4, + community_mixing=0.15, + avg_distrust_links=1.0, + bot_fraction=0.05, + seed=42, + ) + return G + + def test_bot_nodes_created(self, graph_with_bots): + """Verify expected number of bot nodes exist.""" + bot_nodes = [n for n, d in graph_with_bots.nodes(data=True) if d.get("is_bot")] + assert len(bot_nodes) == 10 # round(200 * 0.05) = 10 + + def test_compromised_nodes_marked(self, graph_with_bots): + """Verify compromised account count — derived from total bots / ~15 per cluster.""" + comp_nodes = [n for n, d in graph_with_bots.nodes(data=True) if d.get("is_compromised")] + assert len(comp_nodes) >= 1 + + def test_bot_cluster_connectivity(self, graph_with_bots): + """Each bot should be connected to exactly one compromised account in both directions.""" + G = graph_with_bots + comp_nodes = [n for n, d in G.nodes(data=True) if d.get("is_compromised")] + bot_nodes = [n for n, d in G.nodes(data=True) if d.get("is_bot")] + + for bot in bot_nodes: + # Each bot must have bidirectional TRUSTS with exactly one compromised node + linked_comp = [c for c in comp_nodes if G.has_edge(c, bot) and G.has_edge(bot, c)] + assert len(linked_comp) == 1, ( + f"Bot {bot} should connect to exactly 1 compromised node, found {len(linked_comp)}" + ) + + def test_bot_ids_format(self, graph_with_bots): + """Bot node IDs should match bot_* pattern.""" + bot_nodes = [n for n, d in graph_with_bots.nodes(data=True) if d.get("is_bot")] + for bot in bot_nodes: + assert bot.startswith("bot_"), f"Unexpected bot ID format: {bot}" + + def test_no_bots_when_zero(self): + """bot_fraction=0.0 should produce no bots or compromised markers.""" + G = TrustGraphFetcher.build_graph( + total_users=100, + avg_trust_links=5, + bot_fraction=0.0, + seed=42, + ) + bot_nodes = [n for n, d in G.nodes(data=True) if d.get("is_bot")] + comp_nodes = [n for n, d in G.nodes(data=True) if d.get("is_compromised")] + assert len(bot_nodes) == 0 + assert len(comp_nodes) == 0 + + def test_convergent_distrust_on_compromised(self, graph_with_bots): + """Compromised accounts should have multiple incoming DISTRUSTS edges.""" + G = graph_with_bots + comp_nodes = [n for n, d in G.nodes(data=True) if d.get("is_compromised")] + for comp in comp_nodes: + incoming_distrust = [ + u for u in G.predecessors(comp) + if G.edges[u, comp].get("relationship") == "DISTRUSTS" + ] + assert len(incoming_distrust) >= 1, ( + f"Compromised node {comp} has {len(incoming_distrust)} incoming DISTRUSTS" + ) + + def test_bot_community_is_negative_one(self, graph_with_bots): + """Bot nodes should have community=-1.""" + bot_nodes = [n for n, d in graph_with_bots.nodes(data=True) if d.get("is_bot")] + for bot in bot_nodes: + assert graph_with_bots.nodes[bot]["community"] == -1 + + +class TestBuildGraph: + @pytest.fixture + def small_trust_graph(self): + return TrustGraphFetcher.build_graph( + total_users=100, + avg_trust_links=5, + reciprocity=0.7, + num_communities=4, + community_mixing=0.15, + avg_distrust_links=1.0, + bot_fraction=0.10, + seed=42, + ) + + def test_node_count(self, small_trust_graph): + # 100 users + round(100*0.10)=10 bots = 110 + assert small_trust_graph.number_of_nodes() == 110 + + def test_is_directed(self, small_trust_graph): + assert isinstance(small_trust_graph, nx.DiGraph) + + def test_node_attributes(self, small_trust_graph): + node_data = small_trust_graph.nodes["user_0"] + assert node_data["type"] == "User" + assert "name" in node_data + assert "public_key" in node_data + assert len(node_data["public_key"]) == 64 # 32 bytes hex + assert "created_at" in node_data + assert "community" in node_data + assert "is_bot" in node_data + assert "is_compromised" in node_data + + def test_edge_relationships(self, small_trust_graph): + relationships = set() + for u, v, data in small_trust_graph.edges(data=True): + relationships.add(data.get("relationship")) + assert "TRUSTS" in relationships + assert "DISTRUSTS" in relationships + + def test_has_edges(self, small_trust_graph): + assert small_trust_graph.number_of_edges() > 0 + + def test_node_id_format(self, small_trust_graph): + for node in small_trust_graph.nodes(): + assert node.startswith("user_") or node.startswith("bot_") + + def test_community_labels_assigned(self, small_trust_graph): + communities = set() + for _, data in small_trust_graph.nodes(data=True): + communities.add(data.get("community")) + assert 4 in [c for c in communities if c >= 0] or len([c for c in communities if c >= 0]) == 4 + # Bot nodes have community=-1 + assert -1 in communities + + def test_reproducible_with_seed(self): + g1 = TrustGraphFetcher.build_graph(total_users=50, seed=123) + g2 = TrustGraphFetcher.build_graph(total_users=50, seed=123) + assert g1.number_of_nodes() == g2.number_of_nodes() + assert g1.number_of_edges() == g2.number_of_edges() + assert set(g1.nodes()) == set(g2.nodes()) + + def test_default_parameters(self): + # Smoke test with defaults (but small scale) + g = TrustGraphFetcher.build_graph(total_users=50, seed=1) + assert g.number_of_nodes() > 50 # 50 users + bots + assert g.number_of_edges() > 0 + + +class TestCoreIntegration: + def test_generate_graph_trust_source(self): + from graphfaker.core import GraphFaker + gf = GraphFaker() + g = gf.generate_graph( + source="trust", + total_users=50, + avg_trust_links=5, + bot_fraction=0.06, + seed=42, + ) + assert isinstance(g, nx.DiGraph) + assert g.number_of_nodes() == 53 # 50 users + round(50*0.06)=3 bots + + def test_graphfaker_stores_graph(self): + from graphfaker.core import GraphFaker + gf = GraphFaker() + g = gf.generate_graph(source="trust", total_users=30, seed=42) + assert gf.G is g