diff --git a/mini_project_2/db_queries.py b/mini_project_2/db_queries.py index 2375bb7..06e6261 100644 --- a/mini_project_2/db_queries.py +++ b/mini_project_2/db_queries.py @@ -3,16 +3,14 @@ """Database querying engine for part3""" -import tempfile -import shutil -import os -import operator -from pathlib import Path - import datetime -import re -from logging import getLogger +import operator +import os +import shutil +import tempfile import xml.etree.ElementTree as ElementTree +from logging import getLogger +from pathlib import Path import bsddb3 @@ -93,9 +91,11 @@ def __init__(self, ads: str, terms: str, pdates: str, prices: str, def delete_non_matching_aids(self, matching_aids): """remove ad(s) from the ads index that do not have a aid contained within ``matching_aids``""" - aids_to_delete = [aid for aid in self.ads.keys() if aid not in [bytes(key, "utf-8") for key in matching_aids]] - for aid in aids_to_delete: - self.ads.__delitem__(aid) + for aid in self.ads.keys(): + clean_aid = aid.decode("utf-8").strip() + if clean_aid not in matching_aids: + self.ads.pop(aid) + # TODO: we should also cleanup other indexes here aswell # this isn't needed functionally, but, if done would likely lead to # cleaner code. @@ -105,32 +105,34 @@ def run_term_query(self, search_term: str): word within the title or description fields""" __log__.info("starting term query: search_term: {}".format(search_term)) - if search_term.endswith("%"): - __log__.debug("wildcard detected in search_term: {}".format(search_term)) - base_term = search_term[:-1] - searching_terms = list((key.decode("utf-8").lower() for key, val in self.terms.items() if re.match(r"{}[a-zA-Z0-9\-_]*".format(base_term), key.decode("utf-8")))) - else: - searching_terms = [search_term.lower()] - - searching_terms = set(searching_terms) - - __log__.info("running search_term query: searching_terms: {}".format(searching_terms)) - term_matches = set() # look through terms - for term, data in set(self.terms.items()): + rec = self.terms.first() + while True: + term, data = rec term_str = term.decode("utf-8") data_str = data.decode("utf-8") - if term_str.lower() in searching_terms: - __log__.info("found matching db_term: {} data: {}".format(term_str, data_str)) + if search_term.endswith("%"): + base_term = search_term[:-1] + term_check = term_str.lower().startswith(base_term.lower()) + else: + term_check = term_str.lower() == search_term + if term_check: + __log__.debug("found matching db_term: {} data: {}".format(term_str, data_str)) # get the aid from the terms index - term_matches.add(self.terms[term].decode("utf-8")) + term_matches.add(data_str.strip()) else: - self.terms.__delitem__(term) + # TODO: remove? + pass + + try: + rec = self.terms.next() + except bsddb3.db.DBNotFoundError: + break for aid in term_matches: - if self.ads.has_key(bytes(aid, "utf-8")): + if bytes(aid, "utf-8") in self.ads: if self.full_output: __log__.info("found matching term: search_term: {} aid: {} ad: {}".format(search_term, aid, self.ads[bytes(aid, "utf-8")].decode("utf-8"))) else: @@ -153,7 +155,9 @@ def run_cat_query(self, search_category: str): category_matches = set() # look through prices - for price, data in set(self.prices.items()): + rec = self.prices.first() + while True: + price, data = rec price_str = price.decode("utf-8") data_str = data.decode("utf-8") db_category = get_category(data_str) @@ -161,10 +165,17 @@ def run_cat_query(self, search_category: str): __log__.debug("found matching db_location: {} price: {} data: {}".format(db_category, price_str, data_str)) category_matches.add(get_aid(data_str)) else: - self.prices.__delitem__(price) + # self.prices.pop(price) # TODO + pass + try: + rec = self.prices.next() + except bsddb3.db.DBNotFoundError: + break # look through dates - for date, data in set(self.pdates.items()): + rec = self.pdates.first() + while True: + date, data = rec date_str = date.decode("utf-8") data_str = data.decode("utf-8") db_category = get_category(data_str) @@ -172,10 +183,15 @@ def run_cat_query(self, search_category: str): __log__.debug("found matching db_location: {} date: {} data: {}".format(db_category, date_str, data_str)) category_matches.add(get_aid(data_str)) else: - self.pdates.__delitem__(date) + # self.pdates.pop(date) # TODO + pass + try: + rec = self.pdates.next() + except bsddb3.db.DBNotFoundError: + break for aid in category_matches: - if self.ads.has_key(bytes(aid, "utf-8")): + if bytes(aid, "utf-8") in self.ads: if self.full_output: __log__.info("found matching category: search_category: {} aid: {} ad: {}".format(search_category, aid, self.ads[bytes(aid, "utf-8")].decode("utf-8"))) else: @@ -197,7 +213,9 @@ def run_location_query(self, search_location: str): location_matches = set() # look through prices - for price, data in set(self.prices.items()): + rec = self.prices.first() + while True: + price, data = rec price_str = price.decode("utf-8") data_str = data.decode("utf-8") db_location = get_location(data_str) @@ -205,10 +223,17 @@ def run_location_query(self, search_location: str): __log__.debug("found matching location: {} price: {} data: {}".format(db_location, price_str, data_str)) location_matches.add(get_aid(data_str)) else: - self.prices.__delitem__(price) + # self.prices.pop(price) # TODO + pass + try: + rec = self.prices.next() + except bsddb3.db.DBNotFoundError: + break # look through dates - for date, data in set(self.pdates.items()): + rec = self.pdates.first() + while True: + date, data = rec date_str = date.decode("utf-8") data_str = data.decode("utf-8") db_location = get_location(data_str) @@ -216,10 +241,15 @@ def run_location_query(self, search_location: str): __log__.debug("found matching location: {} date: {} data: {}".format(db_location, date_str, data_str)) location_matches.add(get_aid(data_str)) else: - self.pdates.__delitem__(date) + # self.pdates.pop(date) # TODO + pass + try: + rec = self.pdates.next() + except bsddb3.db.DBNotFoundError: + break for aid in location_matches: - if self.ads.has_key(bytes(aid, "utf-8")): + if bytes(aid, "utf-8") in self.ads: if self.full_output: __log__.info("found matching location: search_location: {} aid: {} ad: {}".format(search_location, aid, self.ads[bytes(aid, "utf-8")].decode("utf-8"))) else: @@ -238,7 +268,9 @@ def run_price_query(self, search_price: int, operator_str: str): price_matches = set() # look through prices - for price, data in set(self.prices.items()): + rec = self.prices.first() + while True: + price, data = rec price_str = price.decode("utf-8") data_str = data.decode("utf-8") db_price = int(price_str) @@ -246,10 +278,15 @@ def run_price_query(self, search_price: int, operator_str: str): __log__.debug("found valid price: {} data: {}".format(price_str, data_str)) price_matches.add(get_aid(data_str)) else: - self.prices.__delitem__(price) + # self.prices.pop(price) # TODO + pass + try: + rec = self.prices.next() + except bsddb3.db.DBNotFoundError: + break for aid in price_matches: - if self.ads.has_key(bytes(aid, "utf-8")): + if bytes(aid, "utf-8") in self.ads: if self.full_output: __log__.info("found matching price: {} aid: {} ad: {}".format(search_price, aid, self.ads[bytes(aid, "utf-8")].decode("utf-8"))) else: @@ -268,8 +305,9 @@ def run_date_query(self, search_date: datetime.datetime, date_matches = set() - # look through dates - for date, data in set(self.pdates.items()): + rec = self.pdates.first() + while True: + date, data = rec date_str = date.decode("utf-8") data_str = data.decode("utf-8") db_date = parse_date(date_str) @@ -277,10 +315,15 @@ def run_date_query(self, search_date: datetime.datetime, __log__.debug("found valid date: {} data: {}".format(date_str, data_str)) date_matches.add(get_aid(data_str)) else: - self.pdates.__delitem__(date) + # self.pdates.pop(date) # TODO + pass + try: + rec = self.pdates.next() + except bsddb3.db.DBNotFoundError: + break for aid in date_matches: - if self.ads.has_key(bytes(aid, "utf-8")): + if bytes(aid, "utf-8") in self.ads: if self.full_output: __log__.info("found matching date: {} aid: {} ad: {}".format(search_date, aid, self.ads[bytes(aid, "utf-8")].decode("utf-8"))) else: @@ -306,7 +349,7 @@ def get_category(data_str: str) -> str: def get_aid(data_str: str) -> str: """Get the ad ID field from either a ``prices`` or ``pdates`` index's key's data""" - return data_str.split(",")[0] + return data_str.split(",")[0].strip() def get_title(ad: str) -> str: diff --git a/mini_project_2/scripts/data/indexes/ad20k.idx b/mini_project_2/scripts/data/indexes/ad20k.idx new file mode 100644 index 0000000..ca58c69 Binary files /dev/null and b/mini_project_2/scripts/data/indexes/ad20k.idx differ diff --git a/mini_project_2/scripts/data/indexes/pd20k.idx b/mini_project_2/scripts/data/indexes/pd20k.idx new file mode 100644 index 0000000..63a314f Binary files /dev/null and b/mini_project_2/scripts/data/indexes/pd20k.idx differ diff --git a/mini_project_2/scripts/data/indexes/pr20k.idx b/mini_project_2/scripts/data/indexes/pr20k.idx new file mode 100644 index 0000000..27dc89e Binary files /dev/null and b/mini_project_2/scripts/data/indexes/pr20k.idx differ diff --git a/mini_project_2/scripts/data/indexes/te20k.idx b/mini_project_2/scripts/data/indexes/te20k.idx new file mode 100644 index 0000000..1e4f754 Binary files /dev/null and b/mini_project_2/scripts/data/indexes/te20k.idx differ diff --git a/mini_project_2/scripts/part2.sh b/mini_project_2/scripts/part2.sh index 8e87c1f..c9e4259 100644 --- a/mini_project_2/scripts/part2.sh +++ b/mini_project_2/scripts/part2.sh @@ -1,3 +1,3 @@ #!/usr/bin/env bash -sort -t : -u $1 | ~/part2/break.pl | db_load -T -t $2 $3 +sort -t : -u $1 | ~/part2/break.pl | db_load -c duplicates=1 -T -t $2 $3 diff --git a/test/unit/test_main.py b/test/unit/test_main.py index 76d96af..d49ca94 100644 --- a/test/unit/test_main.py +++ b/test/unit/test_main.py @@ -55,7 +55,47 @@ def test_main(cmd): """Bad tests to ensure nothing is extremely broken""" base_cmd = """-ad mini_project_2/scripts/data/indexes/ads.idx -te mini_project_2/scripts/data/indexes/terms.idx -da mini_project_2/scripts/data/indexes/pdates.idx -pr mini_project_2/scripts/data/indexes/prices.idx -o full -v --log-level INFO""" main(base_cmd.split() + cmd.split()) - # TODO: more better testing. + + +@pytest.mark.parametrize("cmd", + [ + "price>-30 term=camera", + "price>0 term=camera", + "price<30 term=camera", + "price=30", + "price>=30", + "price<=30", + "price<30", + "price>30", + "term=camera", + "term=camera%", + "location=Red-deer", + "location=Calgary", + "price>30 term=camera location=Calgary", + "price>30 term=camera location=Red-Deer", + "price>30 term=camera location=Red-Deer cat=nonsuchcat", + "cat=nonsuchcat", + "date=2018/01/01", + "date>=2018/01/01", + "date<=2018/01/01", + "date>2018/01/01", + "date<2018/01/01", + "price>30 term=camera location=Red-Deer cat=nonsuchcat date>2018/01/01", + # given queries to test + "term=camera", + "term=camera%", + "date<=2018/11/05", + "date>2018/11/05", + "price<20", + "price>=20", + "location=edmonton date=2018/11/07", + "cat=art-collectibles term=camera", + "term=camera date>=2018/11/05 date<=2018/11/07 price>20 price<40" + ] + ) +def test_main_large(cmd): + base_cmd = """-ad mini_project_2/scripts/data/indexes/ad20k.idx -te mini_project_2/scripts/data/indexes/te20k.idx -da mini_project_2/scripts/data/indexes/pd20k.idx -pr mini_project_2/scripts/data/indexes/pr20k.idx -o full -v --log-level INFO""" + main(base_cmd.split() + cmd.split()) @pytest.mark.parametrize("alpha_numeric_str",