diff --git a/.gitignore b/.gitignore index d35019c..e6b08e1 100644 --- a/.gitignore +++ b/.gitignore @@ -54,3 +54,4 @@ docs/_build/ # JetBrains .idea +.DS_Store diff --git a/genomediff/__init__.py b/genomediff/__init__.py index 4b12a87..012bc0c 100644 --- a/genomediff/__init__.py +++ b/genomediff/__init__.py @@ -39,4 +39,35 @@ def __len__(self): return len(self.mutations) + len(self.evidence) + len(self.validation) def __iter__(self): - return itertools.chain(self.mutations, self.evidence, self.validation) \ No newline at end of file + return itertools.chain(self.mutations, self.evidence, self.validation) + +<<<<<<< HEAD + def __str__(self): + return '\n'.join(["MUTATIONS:",'\n'.join([str(x) for x in self.mutations]), + "EVIDENCE:",'\n'.join([str(x) for x in self.evidence]), + "VALIDATION:",'\n'.join(self.validation)]) + + + def remove(self,*args, mut_type=None): + ''' + Remove mutations that satisfy the given conditions. Implementation of + gdtools REMOVE for genomediff objects. + + Input: a variable number of conditions, e.g. 'gene_name==rrlA','frequency>=0.9'. + If mut_type is specified, only that mutation type will be removed. + Output: self.mutations is updated, with mutations satifying the conditions + having been removed. + ''' + updated_mutations = [] + for rec in self.mutations: + if (mut_type is None or mut_type == rec.type) and rec.satisfies(*args): + continue + else: + updated_mutations.append(rec) + + self.mutations = updated_mutations +======= + #def __str__(self): + # return '\n'.join([self.mutations,self.evidence,self.validation]) + +>>>>>>> d98f95e89b46227d188c372219531e73daa8b852 diff --git a/genomediff/parser.py b/genomediff/parser.py index 595ef5a..3fa061a 100644 --- a/genomediff/parser.py +++ b/genomediff/parser.py @@ -87,4 +87,4 @@ def __iter__(self): yield Record(type, id, self._document, parent_ids, **extra_dct) else: - raise Exception('Could not parse line #{}: {}'.format(i, line)) \ No newline at end of file + raise Exception('Could not parse line #{}: {}'.format(i, line)) diff --git a/genomediff/records.py b/genomediff/records.py index 5cd9927..72982a5 100644 --- a/genomediff/records.py +++ b/genomediff/records.py @@ -1,3 +1,5 @@ +import re + class Metadata(object): def __init__(self, name, value): self.name = name @@ -32,12 +34,71 @@ def __getattr__(self, item): raise AttributeError -def __repr__(self): - return "Record('{}', {}, {}, {})".format(self.type, + def __repr__(self): + return "Record('{}', {}, {}, {})".format(self.type, self.id, self.parent_ids, - ', '.join('{}={}'.format(k, repr(v)) for k, v in self._extra.items())) + ', '.join('{}={}'.format(k, repr(v)) for k, v in self.attributes.items())) + +<<<<<<< HEAD + def __str__(self): + return self.__repr__() + +======= +>>>>>>> d98f95e89b46227d188c372219531e73daa8b852 + + def __eq__(self, other): + ''' this definition allows identical mutations in different genome diffs + to be equal.''' + return self.type == other.type and self.attributes == other.attributes + + def __ne__(self, other): + return not self.__eq__(other) +<<<<<<< HEAD + + def satisfies(self, *args): + ''' + Input: a variable number of conditions, e.g. 'gene_name==rrlA','frequency>=0.9'. + Output: return true if all conditions are true (i.e. correspond to key-values in attributes. + + Find a condition that evaluates to false, otherwise return True. + ''' + + ## helper function to check if values are numbers + def is_number(s): + try: + float(s) + return True + except ValueError: + return False + + for c in args: + assert type(c) == str, "error: supplied condition is not a string." + condition_pattern = re.compile(r'^(?P[_a-z]+)' + '(?P==|!=|<|<=|>|>=)' + '(?P[-_a-zA-Z0-9\.]+)') + condition_match = condition_pattern.match(c) + assert condition_match, "the supplied condition\n"+c+"\n could not be parsed." + cond_key = condition_match.group('key') + cond_comp = condition_match.group('comp') + cond_val = condition_match.group('val') + try: ## in case the given condition is not in the attributes. + attribute_val = self.attributes[cond_key] + except: + continue + + ## add quote marks around strings before eval. can leave numbers alone. + if not is_number(cond_val): + cond_val = "\'"+cond_val+"\'" -def __eq__(self, other): - return self.__dict__ == other.__dict__ + if not is_number(attribute_val): + attribute_val = "\'"+attribute_val+"\'" + else: ## attribute_val is a number in this record-- convert to str for eval. + attribute_val = str(attribute_val) + expr = attribute_val+cond_comp+cond_val + if not eval(expr): + return False + return True +======= +>>>>>>> d98f95e89b46227d188c372219531e73daa8b852 diff --git a/tests.py b/tests.py index bcf84bc..95f5ca0 100644 --- a/tests.py +++ b/tests.py @@ -88,6 +88,65 @@ def test_resolve(self): document = GenomeDiff.read(file) self.assertEqual(document[1].parents, [document[2]]) +class RecordComparisonTestCase(TestCase): + def test_cmp1(self): + file1 = StringIO(""" +#=GENOME_DIFF 1.0 +#=CREATED 20:02:17 23 Jan 2019 +#=PROGRAM breseq 0.33.2 +#=COMMAND breseq -r LCA.gff3 sequence-data/DM0 evolved re-runs (Rohan)/ZDBp889_R1.fastq.gz sequence-data/DM0 evolved re-runs (Rohan)/ZDBp889_R2.fastq.gz sequence-data/ZDBp889_reads.fastq -o consensus/ZDBp889 +#=REFSEQ LCA.gff3 +#=READSEQ sequence-data/DM0 evolved re-runs (Rohan)/ZDBp889_R1.fastq.gz +#=READSEQ sequence-data/DM0 evolved re-runs (Rohan)/ZDBp889_R2.fastq.gz +#=READSEQ sequence-data/ZDBp889_reads.fastq +#=CONVERTED-BASES 644779377 +#=CONVERTED-READS 14448149 +#=INPUT-BASES 645034321 +#=INPUT-READS 14455411 +#=MAPPED-BASES 602854657 +#=MAPPED-READS 13788351 +SNP 1 34 REL606 72313 C + """.strip()) + + document1 = GenomeDiff.read(file1) + + file2 = StringIO(""" +#=GENOME_DIFF 1.0 +#=CREATED 16:49:49 23 Jan 2019 +#=PROGRAM breseq 0.33.2 +#=COMMAND breseq -r LCA.gff3 sequence-data/DM0 evolved re-runs (Rohan)/ZDB67_R1.fastq.gz sequence-data/DM0 evolved re-runs (Rohan)/ZDB67_R2.fastq.gz -o consensus/ZDB67 +#=REFSEQ LCA.gff3 +#=READSEQ sequence-data/DM0 evolved re-runs (Rohan)/ZDB67_R1.fastq.gz +#=READSEQ sequence-data/DM0 evolved re-runs (Rohan)/ZDB67_R2.fastq.gz +#=CONVERTED-BASES 114566968 +#=CONVERTED-READS 419781 +#=INPUT-BASES 114567554 +#=INPUT-READS 419783 +#=MAPPED-BASES 92472620 +#=MAPPED-READS 339813 +SNP 1 12 REL606 72313 C + """.strip()) + + document2 = GenomeDiff.read(file2) + self.assertEqual(document1.mutations,document2.mutations) + + + def test_cmp2(self): + file1 = StringIO(""" +#=GENOME_DIFF 1.0 +SNP 1 12 REL606 72313 C aa_new_seq=G aa_position=92 aa_ref_seq=D codon_new_seq=GGC codon_number=92 codon_position=2 codon_ref_seq=GAC gene_name=araA gene_position=275 gene_product=L-arabinose isomerase gene_strand=< genes_overlapping=araA locus_tag=ECB_00064 locus_tags_overlapping=ECB_00064 mutation_category=snp_nonsynonymous position_end=72313 position_start=72313 snp_type=nonsynonymous transl_table=11 + """.strip()) + + document1 = GenomeDiff.read(file1) + + file2 = StringIO(""" +#=GENOME_DIFF 1.0 +SNP 1 34 REL606 72313 C aa_new_seq=G aa_position=92 aa_ref_seq=D codon_new_seq=GGC codon_number=92 codon_position=2 codon_ref_seq=GAC gene_name=araA gene_position=275 gene_product=L-arabinose isomerase gene_strand=< genes_overlapping=araA locus_tag=ECB_00064 locus_tags_overlapping=ECB_00064 mutation_category=snp_nonsynonymous position_end=72313 position_start=72313 snp_type=nonsynonymous transl_table=11 + """.strip()) + + document2 = GenomeDiff.read(file2) + self.assertEqual(document1.mutations,document2.mutations) + if __name__ == '__main__': - main() \ No newline at end of file + main()