From b22d52590bedad54db0f7f525272ffa930be66c1 Mon Sep 17 00:00:00 2001 From: bethke Date: Thu, 21 Jan 2016 09:38:23 -0800 Subject: [PATCH 01/39] prfs with RDD and meta perform metrics --- src/algorithms/performance_metrics.py | 44 ++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/src/algorithms/performance_metrics.py b/src/algorithms/performance_metrics.py index ae58826..5c955b4 100644 --- a/src/algorithms/performance_metrics.py +++ b/src/algorithms/performance_metrics.py @@ -13,6 +13,19 @@ from sklearn.metrics import jaccard_similarity_score import itertools +def get_perform_metrics(y_actual, y_predicted, n=100): + results = {} + + results['rmse'] = calculate_rmse_using_rdd(y_actual, y_predicted) + results['mae'] = calculate_mae_using_rdd(y_actual,y_predicted) + results['pred_n'] = calculate_precision_at_n(y_actual, y_predicted, n=100) + + results['user_coverage']=calculate_user_coverage(y_actual, y_predicted) + + return results + + + # Accuracy of ratings predictions (aka regression metrics) ===================== # RMSE ----------------------------------------------------------------- @@ -80,11 +93,32 @@ def calculate_mae_using_rdd(y_actual, y_predicted): # Performance, Recall, Fbeta Score, Support -def calculate_prfs_using_rdd(y_actual, y_predicted): - # TODO: it is highly dependent on the labels - ## The actual and predicted interactions also need to be boolean of [interaction, no_interaction] for the sklearn precision_recall_fscore_support` - ## A better metric for recommender systems is precision at N - return +def calculate_prfs_using_rdd(y_actual, y_predicted, average='macro'): + """ + Determines the precision, recall, fscore, and support of the predictions. + With average of macro, the algorithm Calculate metrics for each label, and find their unweighted mean. 
+ See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html for details + + A better metric for recommender systems is precision at N (also in this package) + + Args: + y_actual: actual ratings in the format of an RDD of [ (userId, itemId, actualRating) ] + y_predicted: predicted ratings in the format of an RDD of [ (userId, itemId, predictedRating) ] + + Returns: + precision, recall, fbeta_score, and support values + + """ + + prediction_rating_pairs = y_predicted.map(lambda x: ((x[0], x[1]), x[2]))\ + .join(y_actual.map(lambda x: ((x[0], x[1]), x[2])))\ + .map(lambda ((user, item), (prediction, rating)): (user, item, prediction, rating)) + + true_vals = np.array(prediction_rating_pairs.map(lambda (user, item, prediction, rating): rating).collect()) + pred_vals = np.array(prediction_rating_pairs.map(lambda (user, item, prediction, rating): prediction).collect()) + + return precision_recall_fscore_support(map(lambda x: int(np.round(x)), true_vals),\ + map(lambda x: int(np.round(x)), pred_vals), average = average) def calculate_precision_at_n(y_actual, y_predicted, number_recommended = 100): """ From 84afe8ddcdd7a1dfbeb0be381f1ec74566e67e4d Mon Sep 17 00:00:00 2001 From: bethke Date: Thu, 21 Jan 2016 17:13:39 -0800 Subject: [PATCH 02/39] performance dict update --- src/algorithms/performance_metrics.py | 31 ++++++++++++++++++++++----- src/data_prep/wiki_vectorize.py | 13 +++++------ 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/src/algorithms/performance_metrics.py b/src/algorithms/performance_metrics.py index 5c955b4..173edee 100644 --- a/src/algorithms/performance_metrics.py +++ b/src/algorithms/performance_metrics.py @@ -13,14 +13,35 @@ from sklearn.metrics import jaccard_similarity_score import itertools -def get_perform_metrics(y_actual, y_predicted, n=100): +def get_perform_metrics(y_test, y_train, y_predicted, content_array, n=100, num_partitions=30): results = {} - results['rmse'] = calculate_rmse_using_rdd(y_actual, y_predicted) - results['mae'] = calculate_mae_using_rdd(y_actual,y_predicted) - results['pred_n'] = calculate_precision_at_n(y_actual, y_predicted, n=100) + results['rmse'] = calculate_rmse_using_rdd(y_test, y_predicted) + results['mae'] = calculate_mae_using_rdd(y_test,y_predicted) + results['pred_n'] = calculate_precision_at_n(y_test, y_predicted, n=n) - results['user_coverage']=calculate_user_coverage(y_actual, y_predicted) + #measures of diversity + results['cat_diversity'] = calculate_population_category_diversity(y_predicted, content_array) + results['ils'] = calc_ils(y_predicted, content_array, num_partitions=num_partitions) + + #measures of coverage + results['cat_coverage'] = calculate_catalog_coverage(y_test, y_predicted) + results['item_coverage'] = calculate_item_coverage(y_test, y_predicted) + results['user_coverage'] = calculate_user_coverage(y_test, y_predicted) + results['pred_coverage'] = calculate_prediction_coverage(y_test, y_predicted) + + #measures of serendipity + results['serendipity'] = calculate_serendipity(y_train, y_test, y_predicted, rel_filter=1) + results['content_serendipity'] = calc_content_serendipity(y_test, y_predicted, content_array) + + #measures of novelty + results['novelty'] = calculate_novelty(y_train, y_test, y_predicted) + + #relevancy statistics + rel_stats = calc_relevant_rank_stats(y_test, y_predicted) + results['avg_highest_rank'] = rel_stats[0] + results['avg_mean_rank'] = rel_stats[1] + results['avg_lowest_rank'] = rel_stats[2] return results diff --git 
a/src/data_prep/wiki_vectorize.py b/src/data_prep/wiki_vectorize.py index 37d2eb6..4d24e5e 100644 --- a/src/data_prep/wiki_vectorize.py +++ b/src/data_prep/wiki_vectorize.py @@ -4,7 +4,7 @@ class wiki_vectorize(): - def __init__(self, user_interactions, content, user_vector_type, content_vector_type, **support_files): + def __init__(self, user_interactions, content, user_vector_type, content_vector_type, sqlCtx, **support_files): """ Class initializer to load the required files @@ -24,13 +24,14 @@ def __init__(self, user_interactions, content, user_vector_type, content_vector_ """ self.user_vector_type = user_vector_type self.content_vector_type = content_vector_type + self.sqlCtx = sqlCtx #Filter out uninteresting articles and users if they still exist in the dataset user_interactions.registerTempTable("ratings") content.registerTempTable("content") - filtered = sqlCtx.sql("select * from ratings where redirect_target is null and article_namespace=0 and user_id is not null") - filtered_content = sqlCtx.sql("select * from content where redirect_target is null and article_namespace=0 and full_text is not null") + filtered = self.sqlCtx.sql("select * from ratings where redirect_target is null and article_namespace=0 and user_id is not null") + filtered_content = self.sqlCtx.sql("select * from content where redirect_target is null and article_namespace=0 and full_text is not null") self.filtered = filtered self.filtered.registerTempTable("wiki_ratings") @@ -48,19 +49,19 @@ def __init__(self, user_interactions, content, user_vector_type, content_vector_ def get_user_vector(self): if self.user_vector_type=='num_edits': - user_info = sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings \ + user_info = self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings \ group by user_id, article_id") return user_info elif self.user_vector_type=='any_interact': - user_info = sqlCtx.sql("select user_id as user, article_id as item, 1 as rating from wiki_ratings \ + user_info = self.sqlCtx.sql("select user_id as user, article_id as item, 1 as rating from wiki_ratings \ group by user_id, article_id") return user_info elif self.user_vector_type=='num_edits_ceil': - user_info = sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki \ + user_info = self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki \ group by user_id, article_id")\ .map(lambda (user, article, rating): (user, article, max(rating, 5))) From e0c9b95421463664d709c8a8ee0e21abde1145dd Mon Sep 17 00:00:00 2001 From: bethke Date: Fri, 22 Jan 2016 09:45:33 -0800 Subject: [PATCH 03/39] save-load to hadoop file system --- src/utils/save_load.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/src/utils/save_load.py b/src/utils/save_load.py index ec96385..10475ac 100644 --- a/src/utils/save_load.py +++ b/src/utils/save_load.py @@ -76,4 +76,23 @@ def load_content_vector(input_fname): content1 = line[1].strip("[]") content = [float(i) for i in str.split(content1, ' ')] content_vector.append((item, content)) - return content_vector \ No newline at end of file + return content_vector + +def save_uv_to_hadoop(vector, output_name): + vector.map(lambda x: ','.join(map(str,x))).saveAsTextFile(output_name) + +def load_uv_from_hadoop(input_name, sc, num_partitions=20): + uv = sc.textFile(input_name).map(parseText)\ + .repartition(num_partitions) + return uv + +def parseText(row): + 
row = row.split(',') + return (int(row[0]), int(row[1]), float(row[2])) + +def save_cv_to_hadoop(vector, output_name): + vector.saveAsPickleFile(output_name) + +def load_cv_from_hadoop(input_name,sc, num_partitions=20): + cv = sc.pickleFile(input_name).repartition(num_partitions) + return cv \ No newline at end of file From 080777bcc8c18b78345d0d717118905cf9598442 Mon Sep 17 00:00:00 2001 From: bethke Date: Fri, 22 Jan 2016 16:43:58 -0800 Subject: [PATCH 04/39] minor bug fix --- src/data_prep/wiki_vectorize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_prep/wiki_vectorize.py b/src/data_prep/wiki_vectorize.py index 4d24e5e..6510d53 100644 --- a/src/data_prep/wiki_vectorize.py +++ b/src/data_prep/wiki_vectorize.py @@ -63,7 +63,7 @@ def get_user_vector(self): elif self.user_vector_type=='num_edits_ceil': user_info = self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki \ group by user_id, article_id")\ - .map(lambda (user, article, rating): (user, article, max(rating, 5))) + .map(lambda (user, article, rating): (user, article, min(rating, 5))) return user_info From b47cb099dc2f549b8db7330ce8f296e77cce3209 Mon Sep 17 00:00:00 2001 From: bethke Date: Tue, 26 Jan 2016 13:46:45 -0800 Subject: [PATCH 05/39] OSM to JSON fixes --- src/utils/osm_etl/osm.py | 178 ++++++++++++++++++++++----------------- 1 file changed, 100 insertions(+), 78 deletions(-) diff --git a/src/utils/osm_etl/osm.py b/src/utils/osm_etl/osm.py index 0a30798..910b684 100755 --- a/src/utils/osm_etl/osm.py +++ b/src/utils/osm_etl/osm.py @@ -63,35 +63,37 @@ #JSON osm objects (node, way, or relation): OSM_OBJECT = { "id": None, + "lat": None, + "lon": None, "timestamp": None, "version": None, "changeset": None, "visible": None, "user": None, "uid": None, - "type": None, #can be node, way or relation - "lat": None, - "lon": None, + "osm_type": None, #can be Node, Way or Relation #add in tags "source": None, "building": None, "highway": None, "name": None, - "addr:city": None, - "addr:postcode": None, + "addr_city": None, + "addr_postcode": None, "natural": None, "landuse": None, - "surfacewaterway": None, + "surface": None, + "waterway": None, "power": None, "wall": None, "oneway": None, "amenity": None, "ref": None, - "building:levels": None, + "building_levels": None, "maxspeed": None, "barrier": None, "type": None, - "placefoot": None, + "place": None, + "foot": None, "bicycle": None, "railway": None, "leisure": None, @@ -127,82 +129,102 @@ ) args = parser.parse_args() + tag_names2 = ['source', 'building', 'highway', 'name', 'addr_city', 'addr_postcode', 'natural', 'landuse', 'surface',\ + 'waterway','power','wall','oneway','amenity','ref', 'building_levels', 'maxspeed','barrier','type','place',\ + 'foot','bicycle','railway','leisure','bridge', 'parking','man_made','railway','aeroway', 'wikipedia'] + changeset_json_file = open( args.output_directory +"/changeset_test.json", 'w') node_json_file = open( args.output_directory +"/node_test.json", 'w') relation_map = open( args.output_directory +"/relation_map.txt", 'w') osm_tree = ET.iterparse(args.osm_history, events=("start", "end")) - for event, elem in osm_tree: - #print event, elem - if event == 'start' and elem.tag=='changeset': - #print elem, elem.attrib - #create a new changeset object - c = deepcopy(CHANGESET) - for key, value in elem.attrib.iteritems(): - c[key] = value #for now all values are strings - #print json.dumps(c) - changeset_json_file.write(json.dumps(c) + "\n") - - elif event == 'start' and 
elem.tag=='node': - #create a new node object - n = deepcopy(OSM_OBJECT) - for key, value in elem.attrib.iteritems(): - n[key] = value #for now all values are strings - n["type"] = 'Node' - for child in elem: - if child.tag =='tag': - for key, value in child.attrib.iteritems(): - n[key] = value - - elif event=='end' and elem.tag=='node': - node_json_file.write(json.dumps(n) + "\n") - - elif event == 'start' and elem.tag=='way': - #create a new node object of type way - n = deepcopy(OSM_OBJECT) - nid = elem.get('id') - for key, value in elem.attrib.iteritems(): - n[key] = value #for now all values are strings - n["type"] = 'Way' - for child in elem: - if child.tag =='tag': - for key, value in child.attrib.iteritems(): - n[key] = value - elif child.tag =='nd': - ref_id = child.get('ref') - #these are the node references, we will create a file that is w_id, n_id, w_type, n_type - relation_map.write(nid + ',' + ref_id + ', way, node' + "\n") - else: - print child.tag - - elif event=='end' and elem.tag=='way': - #print json.dumps(n) - node_json_file.write(json.dumps(n) + "\n") - - - elif event == 'start' and elem.tag=='relation': - #create a new node object of type way - n = deepcopy(OSM_OBJECT) - nid = elem.get('id') - for key, value in elem.attrib.iteritems(): - n[key] = value #for now all values are strings - n["type"] = 'Relation' - for child in elem: - if child.tag =='tag': - for key, value in child.attrib.iteritems(): - n[key] = value - elif child.tag =='member': - ref_id = child.get('ref') - r_type = child.get('type') - #these are the node references, we will create a file that is w_id, n_id, w_type, n_type - relation_map.write(nid + ',' + ref_id + ', relation, '+ r_type + "\n") - else: - print child.tag - - elif event=='end' and elem.tag=='way': - #print json.dumps(n) - node_json_file.write(json.dumps(n) + "\n") +for event, elem in osm_tree: + if event == 'start' and elem.tag=='changeset': + #print elem, elem.attrib + #create a new changeset object + c = deepcopy(CHANGESET) + for key, value in elem.attrib.iteritems(): + c[key] = value #for now all values are strings + #print json.dumps(c) + changeset_json_file.write(json.dumps(c) + "\n") + + elif event == 'start' and elem.tag=='node': + #create a new node object + n = deepcopy(OSM_OBJECT) + for key, value in elem.attrib.iteritems(): + #print key, value + key = key.replace(":", "_") + n[key] = value #for now all values are strings + n["osm_type"] = 'Node' + for child in elem: + if child.tag =='tag': + k, v = child.attrib.items() + key = k[1] + value = v[1] + key = key.replace(":", "_") + if key in tag_names2: + n[key] = value + #print key, value + + elif event=='end' and elem.tag=='node': + #print json.dumps(n) + node_json_file.write(json.dumps(n) + "\n") + + elif event == 'start' and elem.tag=='way': + #create a new node object of type way + n = deepcopy(OSM_OBJECT) + nid = elem.get('id') + for key, value in elem.attrib.iteritems(): + n[key] = value #for now all values are strings + n["osm_type"] = 'Way' + for child in elem: + if child.tag =='tag': + k, v = child.attrib.items() + key = k[1] + value = v[1] + key = key.replace(":", "_") + if key in tag_names2: + n[key] = value + elif child.tag =='nd': + ref_id = child.get('ref') + #these are the node references, we will create a file that is w_id, n_id, w_type, n_type + relation_map.write(nid + ',' + ref_id + ', way, node' + "\n") + else: + print child.tag + + elif event=='end' and elem.tag=='way': + #print json.dumps(n) + node_json_file.write(json.dumps(n) + "\n") + + + elif event == 
'start' and elem.tag=='relation': + #create a new node object of type way + n = deepcopy(OSM_OBJECT) + nid = elem.get('id') + for key, value in elem.attrib.iteritems(): + key.replace(":", "_") + n[key] = value #for now all values are strings + n["osm_type"] = 'Relation' + for child in elem: + if child.tag =='tag': + k, v = child.attrib.items() + key = k[1] + value = v[1] + key = key.replace(":", "_") + if key in tag_names2: + n[key] = value + elif child.tag =='member': + ref_id = child.get('ref') + r_type = child.get('type') + #these are the node references, we will create a file that is w_id, n_id, w_type, n_type + relation_map.write(nid + ',' + ref_id + ', relation, '+ r_type + "\n") + else: + print child.tag + + elif event=='end' and elem.tag=='way': + #print json.dumps(n) + node_json_file.write(json.dumps(n) + "\n") changeset_json_file.close() From 53ae05c86522c34b545fa713434739ab7165c3b9 Mon Sep 17 00:00:00 2001 From: bethke Date: Tue, 26 Jan 2016 14:08:14 -0800 Subject: [PATCH 06/39] initial osm data vectorizer --- src/data_prep/osm_vectoize.py | 207 ++++++++++++++++++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 src/data_prep/osm_vectoize.py diff --git a/src/data_prep/osm_vectoize.py b/src/data_prep/osm_vectoize.py new file mode 100644 index 0000000..7d27380 --- /dev/null +++ b/src/data_prep/osm_vectoize.py @@ -0,0 +1,207 @@ +import numpy as np + +class osm_vectorize(): + + def __init__(self, user_interactions, user_vector_type, content_vector_type, sqlCtx, **support_files ): + """ + Class initializer to load the required files + + Args: + user_interactions: The raw RDD of the user interactions. For OSM, these are the object edits as well as the object data + user_vector_type: The type of user vector desired. For MovieLens you can choose between ['ratings', 'pos_ratings', 'ratings_to_interact', 'none']. + If 'none' is used then this means you will run your own custom mapping + content_vector_type: The type of content vector desired. For MovieLens you can choose between ['tags_only', 'none']. + If none is chosen no content vector will be returned and None may be passed into the content argument. + You do not need a content vector to run pure CF only but some performance metrics will not be able to be ran + support_files: If they exist, the supporting files, dataFrames, and/or file links necessary to run the content vectors. 
+ + + """ + self.user_vector_type = user_vector_type + self.content_vector_type = content_vector_type + self.sqlCtx = sqlCtx + + #Filter out uninteresting articles and users if they still exist in the dataset + self.user_interactions =user_interactions + self.user_interactions.registerTempTable("osm_data") + + #if no support files were passed in, initialize an empty support file + if support_files: + self.support_files = support_files + else: + self.support_files = {} + + + def get_user_vector(self): + + if self.user_vector_type=='ratings': + user_info = self.sqlCtx.sql("select user, id, count(1) as rating from filtered_users group by user, id") + return user_info + + elif self.user_vector_type=='any_interact': + user_info = self.user_interactions.map(lambda row: (row.user_id, row.movie_id, row.rating) ).filter(lambda (u,m,r): r>3) + return user_info + + elif self.user_vector_type=='num_edits_ceil': + user_info = self.sqlCtx.sql("select user, id, count(1) as rating from osm_data group by user, id") \ + .map(lambda (user, id_, rating) : (user, id_, min(rating, 5))) + return user_info + + elif self.user_vector_type=='none': + return None + + else: + print "Please choose a user_vector_type between 'ratings', 'any_interact', 'num_edits_ceil', and 'none'" + return None + + def get_content_vector(self): + + if self.content_vector_type=='tags_only': + content_array = self.content.map(lambda row: (row.movie_id, osm_vectorize(row))) + return content_array + + elif self.content_vector_type=='none': + return None + + else: + print "Please choose a content_vector_type between 'tags_only' or 'none'" + return None + + + + +def osm_vectorize(row): + vect = [] + if row.source is not None: + vect.append(1) + else: + vect.append(0) + if row.building is not None: + vect.append(1) + else: + vect.append(0) + if row.highway is not None: + vect.append(1) + else: + vect.append(0) + if row.name is not None: + vect.append(1) + else: + vect.append(0) + if row.addr_city is not None: + vect.append(1) + else: + vect.append(0) + if row.addr_postcode is not None: + vect.append(1) + else: + vect.append(0) + if row.natural is not None: + vect.append(1) + else: + vect.append(0) + if row.landuse is not None: + vect.append(1) + else: + vect.append(0) + if row.surface is not None: + vect.append(1) + else: + vect.append(0) + if row.waterway is not None: + vect.append(1) + else: + vect.append(0) + if row.power is not None: + vect.append(1) + else: + vect.append(0) + if row.wall is not None: + vect.append(1) + else: + vect.append(0) + if row.oneway is not None: + vect.append(1) + else: + vect.append(0) + if row.amenity is not None: + vect.append(1) + else: + vect.append(0) + if row.ref is not None: + vect.append(1) + else: + vect.append(0) + if row.building_levels is not None: + vect.append(1) + else: + vect.append(0) + if row.maxspeed is not None: + vect.append(1) + else: + vect.append(0) + if row.barrier is not None: + vect.append(1) + else: + vect.append(0) + if row.type is not None: + vect.append(1) + else: + vect.append(0) + if row.place is not None: + vect.append(1) + else: + vect.append(0) + if row.foot is not None: + vect.append(1) + else: + vect.append(0) + if row.bicycle is not None: + vect.append(1) + else: + vect.append(0) + if row.railway is not None: + vect.append(1) + else: + vect.append(0) + if row.leisure is not None: + vect.append(1) + else: + vect.append(0) + if row.bridge is not None: + vect.append(1) + else: + vect.append(0) + if row.parking is not None: + vect.append(1) + else: + vect.append(0) + if 
row.man_made is not None: + vect.append(1) + else: + vect.append(0) + if row.railway is not None: + vect.append(1) + else: + vect.append(0) + if row.aeroway is not None: + vect.append(1) + else: + vect.append(0) + if row.wikipedia is not None: + vect.append(1) + else: + vect.append(0) + if row.osm_type =='Node': + vect.append(1) + else: + vect.append(0) + if row.osm_type =='Way': + vect.append(1) + else: + vect.append(0) + if row.osm_type =='Relation': + vect.append(1) + else: + vect.append(0) + return vect From b77460d9c6652f0c39069d1f129624b41da2476e Mon Sep 17 00:00:00 2001 From: bethke Date: Wed, 27 Jan 2016 08:35:20 -0800 Subject: [PATCH 07/39] coverage calc modifications --- src/algorithms/performance_metrics.py | 65 ++++++++++++++++----------- src/data_prep/osm_vectoize.py | 18 +++++--- 2 files changed, 50 insertions(+), 33 deletions(-) diff --git a/src/algorithms/performance_metrics.py b/src/algorithms/performance_metrics.py index 173edee..bcd0ac1 100644 --- a/src/algorithms/performance_metrics.py +++ b/src/algorithms/performance_metrics.py @@ -16,29 +16,32 @@ def get_perform_metrics(y_test, y_train, y_predicted, content_array, n=100, num_partitions=30): results = {} - results['rmse'] = calculate_rmse_using_rdd(y_test, y_predicted) - results['mae'] = calculate_mae_using_rdd(y_test,y_predicted) - results['pred_n'] = calculate_precision_at_n(y_test, y_predicted, n=n) + #because some of the algorithms we will use will only return n predictions per user all results should be analyazed for n recommendations + n_predictions = predictions_to_n(y_predicted, n=n) + + results['rmse'] = calculate_rmse_using_rdd(y_test, n_predictions) + results['mae'] = calculate_mae_using_rdd(y_test,n_predictions) + results['pred_n'] = calculate_precision_at_n(y_test, n_predictions, n=n) #measures of diversity - results['cat_diversity'] = calculate_population_category_diversity(y_predicted, content_array) - results['ils'] = calc_ils(y_predicted, content_array, num_partitions=num_partitions) + results['cat_diversity'] = calculate_population_category_diversity(n_predictions, content_array) + results['ils'] = calc_ils(n_predictions, content_array, num_partitions=num_partitions) #measures of coverage - results['cat_coverage'] = calculate_catalog_coverage(y_test, y_predicted) - results['item_coverage'] = calculate_item_coverage(y_test, y_predicted) - results['user_coverage'] = calculate_user_coverage(y_test, y_predicted) - results['pred_coverage'] = calculate_prediction_coverage(y_test, y_predicted) + results['cat_coverage'] = calculate_catalog_coverage(y_test, y_train, n_predictions) + results['item_coverage'] = calculate_item_coverage(y_test, y_train, n_predictions) + results['user_coverage'] = calculate_user_coverage(y_test, y_train, n_predictions) + results['pred_coverage'] = calculate_prediction_coverage(y_test, n_predictions) #measures of serendipity - results['serendipity'] = calculate_serendipity(y_train, y_test, y_predicted, rel_filter=1) - results['content_serendipity'] = calc_content_serendipity(y_test, y_predicted, content_array) + results['serendipity'] = calculate_serendipity(y_train, y_test, n_predictions, rel_filter=1) + results['content_serendipity'] = calc_content_serendipity(y_test, n_predictions, content_array) #measures of novelty - results['novelty'] = calculate_novelty(y_train, y_test, y_predicted) + results['novelty'] = calculate_novelty(y_train, y_test, n_predictions) #relevancy statistics - rel_stats = calc_relevant_rank_stats(y_test, y_predicted) + rel_stats = 
calc_relevant_rank_stats(y_test, n_predictions) results['avg_highest_rank'] = rel_stats[0] results['avg_mean_rank'] = rel_stats[1] results['avg_lowest_rank'] = rel_stats[2] @@ -295,16 +298,17 @@ def calc_user_ILS(item_list): -def calculate_catalog_coverage(y_actual, y_predicted): +def calculate_catalog_coverage(y_test, y_train, y_predicted): """ Calculates the percentage of user-item pairs that were predicted by the algorithm. - The test data is passed in to determine the total number of potential user-item pairs + The full data is passed in as y_test and y_train to determine the total number of potential user-item pairs Then the predicted data is passed in to determine how many user-item pairs were predicted. It is very important to NOT pass in the sorted and cut prediction RDD and that the algorithm trys to predict all pairs The use the function 'cartesian' as shown in line 25 of content_based.py is helpful in that regard Args: - y_training_data: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] + y_test: the data used to test the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] + y_train: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] y_predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. It is important that this is not the sorted and cut prediction RDD @@ -313,49 +317,55 @@ def calculate_catalog_coverage(y_actual, y_predicted): """ + y_full_data = y_test.union(y_train) + prediction_count = y_predicted.count() #obtain the number of potential users and items from the actual array as the algorithms cannot predict something that was not trained - num_users = y_actual.map(lambda row: row[0]).distinct().count() - num_items = y_actual.map(lambda row: row[1]).distinct().count() + num_users = y_full_data.map(lambda row: row[0]).distinct().count() + num_items = y_full_data.map(lambda row: row[1]).distinct().count() potential_predict = num_users*num_items catalog_coverage = prediction_count/float(potential_predict)*100 return catalog_coverage -def calculate_item_coverage(y_actual, y_predicted): +def calculate_item_coverage(y_test, y_train, y_predicted): """ Calculates the percentage of users pairs that were predicted by the algorithm. - The test data is passed in to determine the total number of potential items + The full dataset is passed in as y_test and y_train to determine the total number of potential items Then the predicted data is passed in to determine how many users pairs were predicted. It is very important to NOT pass in the sorted and cut prediction RDD Args: - y_actual: actual ratings in the format of an array of [ (userId, itemId, actualRating) ] + y_test: the data used to test the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] + y_train: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] y_predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. 
It is important that this is not the sorted and cut prediction RDD Returns: - user_coverage: value representing the percentage of user ratings that were able to be predicted + item_coverage: value representing the percentage of user ratings that were able to be predicted """ + y_full_data = y_test.union(y_train) + predicted_items = y_predicted.map(lambda row: row[1]).distinct().count() #obtain the number of potential users and items from the actual array as the algorithms cannot predict something that was not trained - num_items = y_actual.map(lambda row: row[1]).distinct().count() + num_items = y_full_data.map(lambda row: row[1]).distinct().count() item_coverage = predicted_items/float(num_items)*100 return item_coverage -def calculate_user_coverage(y_actual, y_predicted): +def calculate_user_coverage(y_test, y_train, y_predicted): """ Calculates the percentage of users that were predicted by the algorithm. - The test data is passed in to determine the total number of potential users + The full dataset is passed in as y_test and y_train to determine the total number of potential users Then the predicted data is passed in to determine how many users pairs were predicted. It is very important to NOT pass in the sorted and cut prediction RDD Args: - y_actual: actual ratings in the format of an array of [ (userId, itemId, actualRating) ] + y_test: the data used to test the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] + y_train: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] y_predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. It is important that this is not the sorted and cut prediction RDD @@ -363,10 +373,11 @@ def calculate_user_coverage(y_actual, y_predicted): user_coverage: value representing the percentage of user ratings that were able to be predicted """ + y_full_data = y_test.union(y_train) predicted_users = y_predicted.map(lambda row: row[0]).distinct().count() #obtain the number of potential users and items from the actual array as the algorithms cannot predict something that was not trained - num_users = y_actual.map(lambda row: row[0]).distinct().count() + num_users = y_full_data.map(lambda row: row[0]).distinct().count() user_coverage = predicted_users/float(num_users)*100 diff --git a/src/data_prep/osm_vectoize.py b/src/data_prep/osm_vectoize.py index 7d27380..9bea71c 100644 --- a/src/data_prep/osm_vectoize.py +++ b/src/data_prep/osm_vectoize.py @@ -21,10 +21,13 @@ def __init__(self, user_interactions, user_vector_type, content_vector_type, sql self.content_vector_type = content_vector_type self.sqlCtx = sqlCtx - #Filter out uninteresting articles and users if they still exist in the dataset + #Filter out uninteresting items and users if they still exist in the dataset self.user_interactions =user_interactions self.user_interactions.registerTempTable("osm_data") + filtered = self.sqlCtx.sql("select * from osm_data where id is not Null and uid is not Null") + filtered.registerTempTable("filtered_osm") + #if no support files were passed in, initialize an empty support file if support_files: self.support_files = support_files @@ -35,16 +38,18 @@ def __init__(self, user_interactions, user_vector_type, content_vector_type, sql def get_user_vector(self): if self.user_vector_type=='ratings': - user_info = self.sqlCtx.sql("select user, id, count(1) as rating from filtered_users group by user, id") + user_info = self.sqlCtx.sql("select uid, id, count(1) 
as rating from filtered_osm group by uid, id")\ + .map(lambda (user, item, interact):(int(user), int(item), interact)) return user_info elif self.user_vector_type=='any_interact': - user_info = self.user_interactions.map(lambda row: (row.user_id, row.movie_id, row.rating) ).filter(lambda (u,m,r): r>3) + user_info = self.sqlCtx.sql("select uid, id, 1 as rating from filtered_osm group by uid, id")\ + .map(lambda (user, item, interact):(int(user), int(item), interact)) return user_info elif self.user_vector_type=='num_edits_ceil': - user_info = self.sqlCtx.sql("select user, id, count(1) as rating from osm_data group by user, id") \ - .map(lambda (user, id_, rating) : (user, id_, min(rating, 5))) + user_info = self.sqlCtx.sql("select uid, id, count(1) as rating from filtered_osm group by uid, id") \ + .map(lambda (user, item, interact) : (user, int(item), min(interact, 5))) return user_info elif self.user_vector_type=='none': @@ -57,7 +62,8 @@ def get_user_vector(self): def get_content_vector(self): if self.content_vector_type=='tags_only': - content_array = self.content.map(lambda row: (row.movie_id, osm_vectorize(row))) + content_array = self.content.map(lambda row: (row.id, osm_vectorize(row)))\ + .groupByKey().map(lambda (id, vectors): (id, np.array(list(vectors)).max(axis=0))) return content_array elif self.content_vector_type=='none': From 73128a903994cb25175a67a762b709d7e5dd14ab Mon Sep 17 00:00:00 2001 From: bethke Date: Wed, 27 Jan 2016 09:59:41 -0800 Subject: [PATCH 08/39] pass in sqlCtx for performance metrics --- src/algorithms/performance_metrics.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/algorithms/performance_metrics.py b/src/algorithms/performance_metrics.py index bcd0ac1..5a347a6 100644 --- a/src/algorithms/performance_metrics.py +++ b/src/algorithms/performance_metrics.py @@ -13,15 +13,15 @@ from sklearn.metrics import jaccard_similarity_score import itertools -def get_perform_metrics(y_test, y_train, y_predicted, content_array, n=100, num_partitions=30): +def get_perform_metrics(y_test, y_train, y_predicted, content_array, sqlCtx, num_predictions=100, num_partitions=30): results = {} #because some of the algorithms we will use will only return n predictions per user all results should be analyazed for n recommendations - n_predictions = predictions_to_n(y_predicted, n=n) + n_predictions = predictions_to_n(y_predicted, number_recommended=num_predictions) results['rmse'] = calculate_rmse_using_rdd(y_test, n_predictions) results['mae'] = calculate_mae_using_rdd(y_test,n_predictions) - results['pred_n'] = calculate_precision_at_n(y_test, n_predictions, n=n) + results['pred_n'] = calculate_precision_at_n(y_test, n_predictions, number_recommended=num_predictions) #measures of diversity results['cat_diversity'] = calculate_population_category_diversity(n_predictions, content_array) @@ -34,14 +34,14 @@ def get_perform_metrics(y_test, y_train, y_predicted, content_array, n=100, num_ results['pred_coverage'] = calculate_prediction_coverage(y_test, n_predictions) #measures of serendipity - results['serendipity'] = calculate_serendipity(y_train, y_test, n_predictions, rel_filter=1) - results['content_serendipity'] = calc_content_serendipity(y_test, n_predictions, content_array) + results['serendipity'] = calculate_serendipity(y_train, y_test, n_predictions, sqlCtx, rel_filter=1) + results['content_serendipity'] = calc_content_serendipity(y_test, n_predictions, content_array, sqlCtx) #measures of novelty - results['novelty'] = 
calculate_novelty(y_train, y_test, n_predictions) + results['novelty'] = calculate_novelty(y_train, y_test, n_predictions, sqlCtx) #relevancy statistics - rel_stats = calc_relevant_rank_stats(y_test, n_predictions) + rel_stats = calc_relevant_rank_stats(y_test, n_predictions, sqlCtx) results['avg_highest_rank'] = rel_stats[0] results['avg_mean_rank'] = rel_stats[1] results['avg_lowest_rank'] = rel_stats[2] @@ -409,7 +409,7 @@ def calculate_prediction_coverage(y_actual, y_predicted): return prediction_coverage -def calculate_serendipity(y_train, y_test, y_predicted, rel_filter=1): +def calculate_serendipity(y_train, y_test, y_predicted, sqlCtx, rel_filter=1): """ Calculates the serendipity of the recommendations. This measure of serendipity in particular is how surprising relevant recommendations are to a user @@ -504,7 +504,7 @@ def calculate_serendipity(y_train, y_test, y_predicted, rel_filter=1): return (average_overall_serendipity, average_serendipity) -def calculate_novelty(y_train, y_test, y_predicted): +def calculate_novelty(y_train, y_test, y_predicted, sqlCtx): """ Novelty measures how new or unknown recommendations are to a user An individual item's novelty can be calculated as the log of the popularity of the item @@ -570,7 +570,7 @@ def prob_by_rank(rank, n): prob = (n-rank)/float(n-1) return prob -def calc_content_serendipity(y_actual, y_predicted, content_array): +def calc_content_serendipity(y_actual, y_predicted, content_array, sqlCtx): """ Calculates the serendipity of the recommendations based on their content. This measure of serendipity in particular is how surprising relevant recommendations are to a user @@ -656,7 +656,7 @@ def calc_jaccard_diff(array_1, array_2): #otherwise a numpy float is returned which causes chaos and havoc to ensue return float(dist) -def calc_relevant_rank_stats(y_actual, y_predicted): +def calc_relevant_rank_stats(y_actual, y_predicted, sqlCtx): """ Determines the average minimum, average and maximum ranking of 'relevant' items 'Relevant' here means that the item was rated, i.e., it exists in the y_actual RDD From 9625d0ab8f342748195859e6b2cd1cef8f856082 Mon Sep 17 00:00:00 2001 From: Alexander Gude Date: Thu, 28 Jan 2016 11:37:08 -0800 Subject: [PATCH 09/39] Fix a bug with non-ascii characters in names When converting a git repo to JSON, non-ascii characters (especially non-ascii and non-UTF8 characters) cause a crash. Since these characters are not needed (we do not need an exact name, just a unique name for each entity) we discard them. 
--- src/utils/code_etl/blame_to_json.py | 7 +++++-- src/utils/code_etl/user_to_file_mapper.py | 17 +++++++++++++++-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/src/utils/code_etl/blame_to_json.py b/src/utils/code_etl/blame_to_json.py index b7efa60..74aefd1 100755 --- a/src/utils/code_etl/blame_to_json.py +++ b/src/utils/code_etl/blame_to_json.py @@ -114,14 +114,17 @@ def clean_email(email): return email.strip("<>") # Function to convert timezone to hour office integer def tz_int(tz): return int(tz, 10) + # Function to clean out non-ascii characters + def clean_text(text): return ''.join([i if ord(i) < 128 else '' for i in text]) + # Translation from the porcelain key to the key in our JSON object, as well # as an option transformation to apply first porcelain_to_json = { - "author": ("author", None), + "author": ("author", clean_text), "author-mail": ("author_mail", clean_email), "author-time": ("author_time", int), "author-tz": ("author_timezone", tz_int), - "committer": ("committer", None), + "committer": ("committer", clean_text), "committer-mail": ("committer_mail", clean_email), "committer-time": ("committer_time", int), "committer-tz": ("committer_timezone", tz_int), diff --git a/src/utils/code_etl/user_to_file_mapper.py b/src/utils/code_etl/user_to_file_mapper.py index 150a5f0..f013330 100755 --- a/src/utils/code_etl/user_to_file_mapper.py +++ b/src/utils/code_etl/user_to_file_mapper.py @@ -142,6 +142,19 @@ def parse_block(block, file_map): file_map[file] = [(name, email)] +def clean_text(text): + """ Remove non-ascii characters from a string. + + Args: + text (str): A string. + + Returns: + str: A string with all characters with ord() >= 128 removed. + + """ + return ''.join([i if ord(i) < 128 else '' for i in text]) + + def file_map_to_json(file_map, repo_name): """Returns a list of JSON objects as strings containing the `git log` information. @@ -160,8 +173,8 @@ def file_map_to_json(file_map, repo_name): for key, count in counter.iteritems(): current_json = deepcopy(JSON_LINE) current_json["repo_name"] = repo_name - current_json["author"] = key[0] - current_json["author_mail"] = key[1] + current_json["author"] = clean_text(key[0]) + current_json["author_mail"] = clean_text(key[1]) current_json["filename"] = file current_json["edit_count"] = count jsons.append(json.dumps(current_json)) From 950ee230087d9c2665172084ca436330443105e3 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Tue, 1 Dec 2015 17:20:19 -0800 Subject: [PATCH 10/39] implemented framework but with a hardcoded recommender and metric create setup.py for hermes implement state machine, not yet tested test state machine to work test click to work in groups, might not use it in the future pass path correctly to state add logger and implement part of start_state, wip working start_state and json_to_rdd_state state machine done and tested rmse based on cf_example Write script create_file_containing_paths.py because click can only take one nargs=-1 input. 
able to take in multiple json files and its respective schemas pass in a file that contains paths to json files pass in a file that contains paths to schemas files add utils's book_crossing_etl and lastfm_etl --- .gitignore | 2 +- README.md | 53 +++++- docs/installation.md | 87 +++++++++ hermes/__init__.py | 2 + {src => hermes}/examples/cf_example.py | 10 +- hermes/hermes.py | 167 ++++++++++++++++++ hermes/hermes_ui.py | 135 ++++++++++++++ hermes/hermesctl.py | 114 ++++++++++++ hermes/metrics/__init__.py | 0 .../metrics}/content_based.py | 0 .../metrics}/performance_metrics.py | 0 hermes/modules/__init__.py | 0 hermes/modules/cargo.py | 26 +++ hermes/modules/datum.py | 22 +++ {src/examples => hermes/modules}/singleton.py | 0 hermes/modules/statemachine.py | 36 ++++ {src/examples => hermes/modules}/timer.py | 0 hermes/utils/__init__.py | 0 {src => hermes}/utils/article_to_category.py | 0 .../utils/book_crossing_etl/README.md | 0 .../utils/book_crossing_etl/bookcrossing.py | 10 +- {src => hermes}/utils/clean_categories.py | 0 {src => hermes}/utils/clean_links.py | 0 .../utils/code_etl/blame_to_json.py | 0 {src => hermes}/utils/code_etl/cd.py | 0 {src => hermes}/utils/code_etl/git_manager.py | 0 .../utils/code_etl/repo_to_json.py | 0 .../utils/code_etl/user_to_file_mapper.py | 0 .../utils/content_vector_tf_idf.py | 0 {src => hermes}/utils/glove.py | 0 {src => hermes}/utils/jester_etl/README.md | 0 {src => hermes}/utils/jester_etl/jester.py | 0 {src => hermes}/utils/lastfm_etl/README.md | 0 {src => hermes}/utils/lastfm_etl/lastfm.py | 12 +- .../utils/movielens_etl/ml10m_to_json.py | 0 .../utils/movielens_etl/ml1m_to_json.py | 0 .../utils/movielens_etl/ml20m_to_json.py | 0 .../utils/movielens_etl/movielens.py | 0 {src => hermes}/utils/remove_templates.py | 0 {src => hermes}/utils/wiki_categories.py | 0 {src => hermes}/utils/xml_to_json.py | 0 requirements.txt | 28 +++ scripts/create_file_containing_paths.py | 81 +++++++++ scripts/list_requirements.sh | 1 + scripts/run_once.sh | 2 + setup.py | 76 ++++++++ tests/__init__.py | 0 tests/test_hermes.py | 0 48 files changed, 844 insertions(+), 20 deletions(-) create mode 100644 docs/installation.md create mode 100644 hermes/__init__.py rename {src => hermes}/examples/cf_example.py (98%) create mode 100644 hermes/hermes.py create mode 100644 hermes/hermes_ui.py create mode 100644 hermes/hermesctl.py create mode 100644 hermes/metrics/__init__.py rename {src/algorithms => hermes/metrics}/content_based.py (100%) rename {src/algorithms => hermes/metrics}/performance_metrics.py (100%) create mode 100644 hermes/modules/__init__.py create mode 100644 hermes/modules/cargo.py create mode 100644 hermes/modules/datum.py rename {src/examples => hermes/modules}/singleton.py (100%) create mode 100644 hermes/modules/statemachine.py rename {src/examples => hermes/modules}/timer.py (100%) create mode 100644 hermes/utils/__init__.py rename {src => hermes}/utils/article_to_category.py (100%) rename {src => hermes}/utils/book_crossing_etl/README.md (100%) rename {src => hermes}/utils/book_crossing_etl/bookcrossing.py (96%) rename {src => hermes}/utils/clean_categories.py (100%) rename {src => hermes}/utils/clean_links.py (100%) rename {src => hermes}/utils/code_etl/blame_to_json.py (100%) rename {src => hermes}/utils/code_etl/cd.py (100%) rename {src => hermes}/utils/code_etl/git_manager.py (100%) rename {src => hermes}/utils/code_etl/repo_to_json.py (100%) rename {src => hermes}/utils/code_etl/user_to_file_mapper.py (100%) rename {src => 
hermes}/utils/content_vector_tf_idf.py (100%) rename {src => hermes}/utils/glove.py (100%) rename {src => hermes}/utils/jester_etl/README.md (100%) rename {src => hermes}/utils/jester_etl/jester.py (100%) rename {src => hermes}/utils/lastfm_etl/README.md (100%) rename {src => hermes}/utils/lastfm_etl/lastfm.py (93%) rename {src => hermes}/utils/movielens_etl/ml10m_to_json.py (100%) rename {src => hermes}/utils/movielens_etl/ml1m_to_json.py (100%) rename {src => hermes}/utils/movielens_etl/ml20m_to_json.py (100%) rename {src => hermes}/utils/movielens_etl/movielens.py (100%) rename {src => hermes}/utils/remove_templates.py (100%) rename {src => hermes}/utils/wiki_categories.py (100%) rename {src => hermes}/utils/xml_to_json.py (100%) create mode 100644 requirements.txt create mode 100644 scripts/create_file_containing_paths.py create mode 100755 scripts/list_requirements.sh create mode 100755 scripts/run_once.sh create mode 100644 setup.py create mode 100644 tests/__init__.py create mode 100644 tests/test_hermes.py diff --git a/.gitignore b/.gitignore index 3270149..c1f7ebd 100644 --- a/.gitignore +++ b/.gitignore @@ -112,7 +112,7 @@ lib64 __pycache__ # The __init__.py's that scram puts everywhere -__init__.py +# __init__.py # Installer logs pip-log.txt diff --git a/README.md b/README.md index 930925b..80f6c10 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,53 @@ -# hermes +# Hermes + Hermes is Lab41's foray into recommender systems. It explores how to choose a recommender system for a new application by analyzing the performance of multiple recommender system algorithms on a variety of datasets. -It also explores how recommender systems may assist a software developer of data scientist find new data, tools, and computer programs. +It also explores how recommender systems may assist a software developer or a data scientist to find new data, tools, and computer programs. + +This readme will be updated as the project progresses so stay tuned! + + +## Documentation + +[Hermes Documentation](https://github.com/Lab41/hermes/tree/master/docs) + + +## Basic Installation Guide + +For a detailed installation guide, please read on [Hermes Installation Guide](https://github.com/Lab41/hermes/tree/master/docs/installation.txt). + +### Dependencies: +* Spark 1.5.1 +* Scala 2.11.7 +* Pyspark 0.8.2.1 +* Hadoop 2.7.1 +* virtualenv + +### How to Install Hermes: + +(Optional) After you have installed the dependencies, if you have different projects that require different Python environment, you can use a Virtual Environment. As listed in the Virtual Environment's [site](http://docs.python-guide.org/en/latest/dev/virtualenvs/), "a Virtual Environment is a tool to keep the dependencies required by different projects in separate places, by creating virtual Python environments for them." + +```bash +$ virtualenv name_of_your_virtualenv +$ . name_of_your_virtualenv/bin/activate +``` + +To install Hermes, run +```bash +$ python setup.py install +``` + +This will create a binary called hermes in /usr/local/bin/hermes. Instead of running the binary with the entire path (ie. ./usr/local/bin/hermes), you can install it so that you can run hermes without calling the entire path on the command line. +```bash +$ pip install --editable . +``` + +Now, you can just run hermes the binary and it will prompt you with what you want to do with the data that you have. +```bash +$ hermes +``` + + +## State of Build -This readme will be updated as the project progresses so stay tuned! 
\ No newline at end of file +It is currently in progress. We will show the progress of the build using TravisCI once it is established. diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 0000000..2c559f3 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,87 @@ +## Hermes Installation Guide + +### Dependencies: +* Spark 1.5.1 +* Scala 2.11.7 +* Pyspark 0.8.2.1 +* Hadoop 2.7.1 +* virtualenv + +### How to Install Dependencies on Mac OS X: +#### Installing Spark, Scala, and PySpark +1. Install Java + 1. Download + 2. Double click on .dmg file to install. + 3. In a terminal, type java -version. You should see the following: +` +java version "1.8.0_65" +Java(TM) SE Runtime Environment (build 1.8.0_65-b17) +Java HotSpot(TM) 64-Bit Server VM (build 25.65-b01, mixed mode) +` +2. Set JAVA_HOME +export JAVA_HOME=$(/usr/libexec/java_home) + +3. Install Homebrew +` +ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" +` + +4. Install Scala +` +brew install scala +` + +5. Download Spark from https://spark.apache.org/downloads.html. + +6. Set SCALA_HOME and SPARK_HOME and export it to path in your .bash_profile. +` +export SPARK_HOME=/path/to/your/spark +export PATH=$PATH:$SPARK_HOME/bin +export SCALA_HOME=/path/to/your/scala +export PATH=$PATH:$SCALA_HOME/bin +` + +7. Export PySpark classes to the Python path after you have installed Python. +` +export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH +` + +8. Build and install Apache Spark +` +brew install sbt +cd $SPARK_HOME +sbt/sbt clean assembly +` + +#### Installing Hadoop +Please follow this [guide](http://zhongyaonan.com/hadoop-tutorial/setting-up-hadoop-2-6-on-mac-osx-yosemite.html). + +#### Installing virtualenv +Please read this [guide](http://docs.python-guide.org/en/latest/dev/virtualenvs/) for more details. +` +pip install virtualenv +` + +### How to Install Hermes: + +(Optional) After you have installed the dependencies, if you have different projects that require different Python environment, you can use a Virtual Environment. As listed in the Virtual Environment's [site](http://docs.python-guide.org/en/latest/dev/virtualenvs/), "a Virtual Environment is a tool to keep the dependencies required by different projects in separate places, by creating virtual Python environments for them." + +```bash +$ virtualenv name_of_your_virtualenv +$ . name_of_your_virtualenv/bin/activate +``` + +To install Hermes, run +```bash +$ python setup.py install +``` + +This will create a binary called hermes in /usr/local/bin/hermes. Instead of running the binary with the entire path (ie. ./usr/local/bin/hermes), you can install it so that you can run hermes automatically on the command line. +```bash +$ pip install --editable . +``` + +Now, you can just run hermes the binary and it will prompt you with what you want to do with the data that you have. 
+```bash +$ hermes +``` \ No newline at end of file diff --git a/hermes/__init__.py b/hermes/__init__.py new file mode 100644 index 0000000..61a202c --- /dev/null +++ b/hermes/__init__.py @@ -0,0 +1,2 @@ +import hermes +__version__ = '1.0' \ No newline at end of file diff --git a/src/examples/cf_example.py b/hermes/examples/cf_example.py similarity index 98% rename from src/examples/cf_example.py rename to hermes/examples/cf_example.py index d18d4fe..bb408a8 100644 --- a/src/examples/cf_example.py +++ b/hermes/examples/cf_example.py @@ -18,12 +18,12 @@ from sklearn.cross_validation import train_test_split from sklearn.cross_validation import StratifiedShuffleSplit -sys.path.append("../algorithms") +sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/" + "..")) -import performance_metrics as pm -import content_based as cb -from singleton import SCSingleton -from timer import Timer +import metrics.performance_metrics as pm +import metrics.content_based as cb +from modules.singleton import SCSingleton +from modules.timer import Timer """ This entire file is to provide a basic understanding of collaborative filtering diff --git a/hermes/hermes.py b/hermes/hermes.py new file mode 100644 index 0000000..959aae6 --- /dev/null +++ b/hermes/hermes.py @@ -0,0 +1,167 @@ +"""Helper functions to hermesctl.py""" + +import json +import os +from pyspark.sql.types import StructType + +import hermes_ui +import metrics.performance_metrics +import modules.datum +import modules.timer + +# TODO: empty certain items in cargo after no longer needed? +# TODO: when to use error_state? do try-catch for all states? + +def start_state(cargo): + """Start of the state machine. Create HDFS directory and upload the input data. + Returns: json_to_rdd_state as next state + """ + + if cargo.verbose: cargo.logger.debug("In start_state:") + + if (len(cargo.json_paths) != len(cargo.schema_paths)) and (len(cargos.schema_paths) > 0): + cargo.error_msg = "Each JSON file does not have its respective schema file." + newState = error_state + return newstate, cargo + + if cargo.verbose: cargo.logger.debug("Creating the hdfs directory " + cargo.hdfs_dir) + os.system("hdfs dfs -mkdir " + cargo.hdfs_dir) + + for i in range(0, len(cargo.json_paths)): + json_path = cargo.json_paths[i] + if cargo.verbose: cargo.logger.debug("Loading JSON file " + json_path + " into hdfs directory " + cargo.hdfs_dir) + os.system("hdfs dfs -put " + json_path + " " + cargo.hdfs_dir + "/" + os.path.basename(json_path)) + + newState = json_to_rdd_state + if cargo.verbose: cargo.logger.debug("start_state -> json_to_rdd_state") + + return newState, cargo + +def json_to_rdd_state(cargo): + """Parse JSON to RDD. 
+ Returns: split_data_state as next state + """ + + if cargo.verbose: cargo.logger.debug("In json_to_rdd_state:") + + num_json_files = len(cargo.json_paths) + num_schema_files = len(cargo.schema_paths) + + # load schema files + schemas = [] + for i in range(0, num_schema_files): + schema_path = cargo.schema_paths[i] + if not schema_path: + # no schema for its respective json file + schemas.append(None) + else: + if cargo.verbose: cargo.logger.debug("Loading schema file %s" % schema_path) + with open(schema_path, "r") as schema_file: + schema = StructType.fromJson(json.load(schema_file)) + schemas.append(schema) + + # create RDD for each JSON file and store it in a Datum object + datums = [] + for i in range(0, num_json_files): + json_path = cargo.json_paths[i] + schema_path = cargo.schema_paths[i] + try: + schema = schemas[i] + except IndexError: + schema = None + + if cargo.verbose: cargo.logger.debug("Creating dataframe based on the content of the json file %s" % json_path) + dataframe = cargo.scsingleton.sqlCtx.read.json("hdfs://" + cargo.fs_default_ip_addr + "/" + cargo.hdfs_dir + "/" + os.path.basename(json_path), schema=schema) + # explicitly repartition RDD after loading so that more tasks can run on it in parallel + # by default, defaultMinPartitions == defaultParallelism == estimated # of cores across all of the machines in your cluster + dataframe = dataframe.repartition(cargo.scsingleton.sc.defaultParallelism * 3) + + if schema is None: + schema = dataframe.schema + + rdd_format = hermes_ui._ask_user_for_rdd_format(schema_path, schema.names) + + if cargo.verbose: cargo.logger.debug("Creating RDD based on the format given by the user for json file %s" % json_path) + rdd = dataframe.map(lambda row: tuple(row[i] for i in rdd_format)).cache() + + if cargo.verbose: cargo.logger.debug("Storing RDD in Datum object for json file %s" % json_path) + datum = modules.datum.Datum(json_path, rdd) + datums.append(datum) + + cargo.datums = datums + + newState = split_data_state + if cargo.verbose: cargo.logger.debug("json_to_rdd_state -> split_data_state") + + return newState, cargo + +def split_data_state(cargo): + """Split data to train, test, and (optional) validate. + Returns: next state dependent whether or not it is using collaborative filtering or content based + """ + + if cargo.verbose: cargo.logger.debug("In split_data_state:") + + for i in range(0, len(cargo.datums)): + datum = cargo.datums[i] + weights, seed = hermes_ui._ask_user_for_split_percentage(datum.json_path) + datum.split_data(weights, seed) + + newState = develop_model_state + if cargo.verbose: cargo.logger.debug("split_data_state -> develop_model_state") + + return newState, cargo + +def develop_model_state(cargo): + """Develop model based on the train data. This model will be used to predict test data. + Returns: calculate_metrics_state as next state + """ + + if cargo.verbose: cargo.logger.debug("In develop_model_state:") + + for i in range(0, len(cargo.datums)): + datum = cargo.datums[i] + with modules.timer.Timer() as t: + # TODO: build model, please do not hardcode what to use for model + from pyspark.mllib.recommendation import ALS + cargo.model = ALS.train(datum.trainingRdd, rank=3) + if cargo.verbose: cargo.logger.debug("Building model takes %s seconds" % t.secs) + + + newState = calculate_metrics_state + if cargo.verbose: cargo.logger.debug("develop_model_state -> calculate_metrics_state") + + return newState, cargo + +def calculate_metrics_state(cargo): + """Test the metrics specified by the user. 
This is an end state. + Returns: None because this is the last state. + """ + + if cargo.verbose: cargo.logger.debug("In calculate_metrics_state:") + + for i in range(0, len(cargo.datums)): + datum = cargo.datums[i] + with modules.timer.Timer() as t: + # TODO: make a prediction, please do not hardcode what to do here + testPredRDD = cargo.model.predictAll( datum.testRdd.map( lambda x: (x[0], x[1]) ) ).cache() + if cargo.verbose: cargo.logger.debug("Making prediction takes %s seconds" % t.secs) + with modules.timer.Timer() as t: + # TODO: calculate metric, please do not hardcode what to use for metric + testRmse = metrics.performance_metrics.calculate_rmse_using_rdd(datum.testRdd, testPredRDD) + if cargo.verbose: cargo.logger.debug("Calculating metric takes %s seconds" % t.secs) + print "testRmse", testRmse + + if cargo.verbose: cargo.logger.debug("calculate_metrics_state -> end_state") + + return + +def error_state(cargo): + """Error state. Print out the error messages. This is an end state. + Returns: None because this is the last state. + """ + if cargo.verbose: cargo.logger.debug("In error_state:") + cargo.logger.error("ERROR: " + cargo.error_msg) + if cargo.verbose: cargo.logger.debug("error_state -> end_state") + return + diff --git a/hermes/hermes_ui.py b/hermes/hermes_ui.py new file mode 100644 index 0000000..aea8d41 --- /dev/null +++ b/hermes/hermes_ui.py @@ -0,0 +1,135 @@ +def _ask_user_for_rdd_format(schema_path, schema_names): + """Ask user for the desired RDD format. + Args: + schema_path: the path to the schema file + schema_names: + Returns: List of schema_name's id. + """ + print "How do you want your data to be parsed?" + print "For example: Given the following options" + print "(0) movie_id" + print "(1) rating" + print "(2) timestamp" + print "(3) user_id" + print "if you wanted the data to be parsed in the format of [(user_id, movie_id, rating)]," + print "please type in: 3 0 1\n" + + def _check_schema_ids(schema_ids, num_schema_ids): + + # check if each schema_name_id is in the range of num_schema_ids + for schema_name_id in schema_name_ids: + if schema_name_id not in range(0, num_schema_ids): + print "Option provided is not in range." + return False + + # check that there are no duplicates + if len(schema_name_ids) != len(set(schema_name_ids)): + print "There are duplicates. Please provide no duplicates." + return False + + return True + + + print "For the following given schema %s" % (schema_path) + print "how do you want your data to be parsed? " + for i in range(0, len(schema_names)): + print "(%s) %s" % (i, schema_names[i]) + + while True: + user_input = raw_input("Enter the numbers separated by blank space: ") + try: + schema_name_ids = [int(schema_name_id.strip()) for schema_name_id in user_input.split(" ")] + if _check_schema_ids(schema_name_ids, len(schema_names)): + break + except ValueError: + print "Please provide a valid number." + + return schema_name_ids + +def _ask_user_for_split_percentage(datum_json_path): + """Ask user what percentage to split the data into training, test, and validation. + Args: + datum_json_path: the path to the data JSON file + Returns: Tuple of percentage of training, test, and validation respectively in float notation. + (trainingPercentage, testPercentage, validationPercentage), seed + """ + print "How do you want to split your data?" 
+ print "For example: If you wanted to split the data into " + print "60\% training, 40\% test, 0\% validation, seed = 11, please type in:" + print "Percentage for training: 60" + print "Percentage for test: 40" + print "Percentage for validation: 0" + print "Seed: 11\n" + + + def _check_percentage(percentage): + """Check if the percentage is valid. + """ + if percentage in range(0, 100): + return True + else: + return False + + def _check_sum_percentage(a, b, c): + """Check if the sum of the given percentages is equal to 100. + """ + sum_percentage = a + b + c + if sum_percentage == 100: + return True + else: + return False + + print "For the following given data %s" % (datum_json_path) + print "how do you want to split your data?" + while True: + while True: + try: + trainingPercentage = int(raw_input("Percentage for training: ").strip()) + except ValueError: + print "Please provide a valid number." + else: + if _check_percentage(trainingPercentage): + break + else: + print "Please provide a number from 0 - 100." + while True: + try: + testPercentage = int(raw_input("Precentage for test: ").strip()) + except ValueError: + print "Please provide a valid number." + else: + if _check_percentage(testPercentage): + break + else: + print "Please provide a number from 0 - 100." + while True: + try: + validationPercentage = int(raw_input("Percentage for validation: ").strip()) + except ValueError: + print "Please provide a valid number." + else: + if _check_percentage(validationPercentage): + break + else: + print "Please provide a number from 0 - 100." + if _check_sum_percentage(trainingPercentage, testPercentage, validationPercentage): + break + else: + print "Sum of percentages does not equal to 100. Please re-input the percentages." + + while True: + try: + seed = int(raw_input("Seed: ").strip()) + break + except ValueError: + print "Please provide a valid number." + + # convert it to a percentage from 0 - 1 + trainingPercentage = trainingPercentage/100. + testPercentage = testPercentage/100. + validationPercentage = validationPercentage/100. + + return [trainingPercentage, testPercentage, validationPercentage], seed + + + diff --git a/hermes/hermesctl.py b/hermes/hermesctl.py new file mode 100644 index 0000000..753f46b --- /dev/null +++ b/hermes/hermesctl.py @@ -0,0 +1,114 @@ +"""Script to run hermes via command line.""" + +import click +import sys + +import hermes +from modules.cargo import Cargo +from modules.statemachine import StateMachine + + +def add_states(stateMachine): + """ json_to_rdd -> split_data - (Collaborative Filtering) -> develop_model -> calculate_metrics + - (Content Based) -> ??? 
+ """ + stateMachine.add_state(hermes.start_state) + stateMachine.add_state(hermes.json_to_rdd_state) + stateMachine.add_state(hermes.split_data_state) + stateMachine.add_state(hermes.develop_model_state) + stateMachine.add_state(hermes.calculate_metrics_state, isEndState=1) + stateMachine.add_state(hermes.error_state, isEndState=1) + stateMachine.set_start(hermes.start_state) + return + +def create_logger(): + import logging + logger = logging.getLogger(__name__) + logger.setLevel(logging.DEBUG) + # create file handler which logs even debug messages + fh = logging.FileHandler("hermes.log") + fh.setLevel(logging.DEBUG) + # create console handler for stderr with a higher log level + che = logging.StreamHandler() + che.setLevel(logging.ERROR) + # create console handler for stdout for info, debug, and error level + choi = logging.StreamHandler(sys.stdout) + choi.setLevel(logging.INFO) + chod = logging.StreamHandler(sys.stdout) + chod.setLevel(logging.DEBUG) + choe = logging.StreamHandler(sys.stdout) + choe.setLevel(logging.ERROR) + # create formatter and add it to the handlers + formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + fh.setFormatter(formatter) + che.setFormatter(formatter) + choi.setFormatter(formatter) + chod.setFormatter(formatter) + choe.setFormatter(formatter) + # add handlers to logger + logger.addHandler(fh) + logger.addHandler(che) + logger.addHandler(choi) + logger.addHandler(chod) + logger.addHandler(choe) + return logger + +def create_sparkcontext(): + from pyspark import SparkConf + from modules.singleton import SCSingleton + conf = SparkConf().setAppName("hermes") + return SCSingleton(conf) + +def extract_paths(file_with_paths): + return [line.rstrip("\n") for line in open(file_with_paths)] + +def print_version(ctx, param, value): + """Print the current version of hermes and exit.""" + if not value: + return + import pkg_resources + version = None + try: + version = pkg_resources.get_distribution("hermes").version + finally: + del pkg_resources + click.echo(version) + ctx.exit() + +@click.command() +@click.option("--verbose", is_flag=True, \ + help="Print debug messages") +@click.option("--version", callback=print_version, is_flag=True, expose_value=False, is_eager=True, \ + help="Display hermes's version number") +@click.option("--hdfs_dir", default="datasets", \ + help="Name of HDFS directory to store input data.") +# IP address of fs.default.name used in HDFS +@click.argument("fs_default_ip_addr", default="localhost:9000") +# Path to a file that lists JSON files. 
+@click.argument("file_to_json_paths", type=click.Path(exists=True), nargs=1) +@click.option("--schemas", type=click.Path(exists=True), nargs=1, \ + help="Path to a file that lists each JSON file's schema.") +def main(verbose, hdfs_dir, fs_default_ip_addr, file_to_json_paths, schemas): + """Hermes allows you to run multiple recommender system metrics on your chosen dataset.""" + + # create state machine + stateMachine = StateMachine() + add_states(stateMachine) + + # create cargo + cargo = Cargo() + + # add items to cargo + cargo.scsingleton = create_sparkcontext() + cargo.logger = create_logger() + cargo.verbose = verbose + cargo.hdfs_dir = hdfs_dir + cargo.fs_default_ip_addr = fs_default_ip_addr + cargo.json_paths = extract_paths(file_to_json_paths) + cargo.schema_paths = extract_paths(schemas) + + # run state machine + stateMachine.run(cargo) + + + diff --git a/hermes/metrics/__init__.py b/hermes/metrics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/algorithms/content_based.py b/hermes/metrics/content_based.py similarity index 100% rename from src/algorithms/content_based.py rename to hermes/metrics/content_based.py diff --git a/src/algorithms/performance_metrics.py b/hermes/metrics/performance_metrics.py similarity index 100% rename from src/algorithms/performance_metrics.py rename to hermes/metrics/performance_metrics.py diff --git a/hermes/modules/__init__.py b/hermes/modules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hermes/modules/cargo.py b/hermes/modules/cargo.py new file mode 100644 index 0000000..456b2cf --- /dev/null +++ b/hermes/modules/cargo.py @@ -0,0 +1,26 @@ +class Cargo(object): + """Cargo contains objects that are passed around in the state machine. + + Args: + scsingleton: Spark Context. There can only be one scsingleton running. + logger: + verbose: + hdfs_dir: + fs_default_ip_addr: + json_paths: + schema_path: + schema: + error_msg: + """ + def __init__(self): + self.scsingleton = None + self.logger = None + self.verbose = False + self.hdfs_dir = None + self.fs_default_ip_addr = None + self.json_paths = [] + self.schema_paths = [] + self.datums = [] + self.model = None + self.error_msg = "" + diff --git a/hermes/modules/datum.py b/hermes/modules/datum.py new file mode 100644 index 0000000..8cfb8f0 --- /dev/null +++ b/hermes/modules/datum.py @@ -0,0 +1,22 @@ + +class Datum(object): + """Datum is a single data being subjected to + recommender system algorithms and performance metrics. 
+ """ + + def __init__(self, json_path, rdd): + self.json_path = json_path + self.rdd = rdd + self.trainingRdd = None + self.testRdd = None + self.validationRdd = None + + def split_data(self, weights, seed): + trainingRdd, testRdd, validationRdd = self.rdd.randomSplit(weights, seed) + self.trainingRdd = trainingRdd.cache() + self.testRdd = testRdd.cache() + self.validationRdd = validationRdd.cache() + + + + diff --git a/src/examples/singleton.py b/hermes/modules/singleton.py similarity index 100% rename from src/examples/singleton.py rename to hermes/modules/singleton.py diff --git a/hermes/modules/statemachine.py b/hermes/modules/statemachine.py new file mode 100644 index 0000000..5afdd90 --- /dev/null +++ b/hermes/modules/statemachine.py @@ -0,0 +1,36 @@ +class InitializationError(Exception): pass + +class StateMachine: + def __init__(self): + self.handlers = [] + self.startState = None + self.endStates = [] + + def add_state(self, handler, isEndState=0): + self.handlers.append(handler) + if isEndState: + self.endStates.append(handler) + + def set_start(self, handler): + self.startState = handler + + def run(self, cargo=None): + if not self.startState: + raise InitializationError("Must call .set_start() before .run()") + if not self.endStates: + raise InitializationError("Must call .set_start() before .run()") + + handler = self.startState + + while True: + (newState, cargo) = handler(cargo) + if newState in self.endStates: + newState(cargo) + break + elif newState not in self.handlers: + print self.handlers + raise RuntimeError("Invalid state %s" % newState) + else: + handler = newState + + return self \ No newline at end of file diff --git a/src/examples/timer.py b/hermes/modules/timer.py similarity index 100% rename from src/examples/timer.py rename to hermes/modules/timer.py diff --git a/hermes/utils/__init__.py b/hermes/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/article_to_category.py b/hermes/utils/article_to_category.py similarity index 100% rename from src/utils/article_to_category.py rename to hermes/utils/article_to_category.py diff --git a/src/utils/book_crossing_etl/README.md b/hermes/utils/book_crossing_etl/README.md similarity index 100% rename from src/utils/book_crossing_etl/README.md rename to hermes/utils/book_crossing_etl/README.md diff --git a/src/utils/book_crossing_etl/bookcrossing.py b/hermes/utils/book_crossing_etl/bookcrossing.py similarity index 96% rename from src/utils/book_crossing_etl/bookcrossing.py rename to hermes/utils/book_crossing_etl/bookcrossing.py index d91a8ec..af806f4 100755 --- a/src/utils/book_crossing_etl/bookcrossing.py +++ b/hermes/utils/book_crossing_etl/bookcrossing.py @@ -217,7 +217,7 @@ def parse_book_line(line): ) parser.add_argument( '-o', - '--output-directory', + '--output_directory', type=str, action="store", help="the directory to save the output JSON files, by default the current directory", @@ -257,8 +257,8 @@ def parse_book_line(line): with\ open(args.ratings, 'rb') as csvfile,\ - open("book-crossing_implicit_ratings.json", 'w') as imp,\ - open("book-crossing_explicit_ratings.json", 'w') as exp: + open("implicit_ratings.json", 'w') as imp,\ + open("explicit_ratings.json", 'w') as exp: for line in iter_lines(csvfile): ret = parse_rating_line(line) @@ -275,11 +275,11 @@ def parse_book_line(line): # outputs. 
rated_and_valid_users = set(rated_users) - with open("book-crossing_books.json", 'w') as f: + with open("books.json", 'w') as f: for ret in book_data: f.write(json.dumps(ret) + '\n') - with open("book-crossing_users.json", 'w') as f: + with open("users.json", 'w') as f: for ret in users_data: if ret["user_id"] in rated_and_valid_users: f.write(json.dumps(ret) + '\n') diff --git a/src/utils/clean_categories.py b/hermes/utils/clean_categories.py similarity index 100% rename from src/utils/clean_categories.py rename to hermes/utils/clean_categories.py diff --git a/src/utils/clean_links.py b/hermes/utils/clean_links.py similarity index 100% rename from src/utils/clean_links.py rename to hermes/utils/clean_links.py diff --git a/src/utils/code_etl/blame_to_json.py b/hermes/utils/code_etl/blame_to_json.py similarity index 100% rename from src/utils/code_etl/blame_to_json.py rename to hermes/utils/code_etl/blame_to_json.py diff --git a/src/utils/code_etl/cd.py b/hermes/utils/code_etl/cd.py similarity index 100% rename from src/utils/code_etl/cd.py rename to hermes/utils/code_etl/cd.py diff --git a/src/utils/code_etl/git_manager.py b/hermes/utils/code_etl/git_manager.py similarity index 100% rename from src/utils/code_etl/git_manager.py rename to hermes/utils/code_etl/git_manager.py diff --git a/src/utils/code_etl/repo_to_json.py b/hermes/utils/code_etl/repo_to_json.py similarity index 100% rename from src/utils/code_etl/repo_to_json.py rename to hermes/utils/code_etl/repo_to_json.py diff --git a/src/utils/code_etl/user_to_file_mapper.py b/hermes/utils/code_etl/user_to_file_mapper.py similarity index 100% rename from src/utils/code_etl/user_to_file_mapper.py rename to hermes/utils/code_etl/user_to_file_mapper.py diff --git a/src/utils/content_vector_tf_idf.py b/hermes/utils/content_vector_tf_idf.py similarity index 100% rename from src/utils/content_vector_tf_idf.py rename to hermes/utils/content_vector_tf_idf.py diff --git a/src/utils/glove.py b/hermes/utils/glove.py similarity index 100% rename from src/utils/glove.py rename to hermes/utils/glove.py diff --git a/src/utils/jester_etl/README.md b/hermes/utils/jester_etl/README.md similarity index 100% rename from src/utils/jester_etl/README.md rename to hermes/utils/jester_etl/README.md diff --git a/src/utils/jester_etl/jester.py b/hermes/utils/jester_etl/jester.py similarity index 100% rename from src/utils/jester_etl/jester.py rename to hermes/utils/jester_etl/jester.py diff --git a/src/utils/lastfm_etl/README.md b/hermes/utils/lastfm_etl/README.md similarity index 100% rename from src/utils/lastfm_etl/README.md rename to hermes/utils/lastfm_etl/README.md diff --git a/src/utils/lastfm_etl/lastfm.py b/hermes/utils/lastfm_etl/lastfm.py similarity index 93% rename from src/utils/lastfm_etl/lastfm.py rename to hermes/utils/lastfm_etl/lastfm.py index 1b0c292..123bfbe 100755 --- a/src/utils/lastfm_etl/lastfm.py +++ b/hermes/utils/lastfm_etl/lastfm.py @@ -276,7 +276,7 @@ def parse_plays_line(line): ) parser.add_argument( '-o', - '--output-directory', + '--output_directory', type=str, action="store", help="the directory to save the output JSON files, by default the current directory", @@ -287,11 +287,11 @@ def parse_plays_line(line): # Parse the files processing_queue = ( - (args.artists, args.output_directory + "/lastfm_artists.json", parse_artist_line), - (args.tags, args.output_directory + "/lastfm_tags.json", parse_tag_line), - (args.friends, args.output_directory + "/lastfm_friends.json", parse_friends_line), - (args.applied_tags, 
args.output_directory + "/lastfm_applied_tags.json", parse_applied_tag_line), - (args.plays, args.output_directory + "/lastfm_plays.json", parse_plays_line), + (args.artists, args.output_directory + "/artists.json", parse_artist_line), + (args.tags, args.output_directory + "/tags.json", parse_tag_line), + (args.friends, args.output_directory + "/friends.json", parse_friends_line), + (args.applied_tags, args.output_directory + "/applied_tags.json", parse_applied_tag_line), + (args.plays, args.output_directory + "/plays.json", parse_plays_line), ) for input_file, output_file, function in processing_queue: with open(input_file, 'rb') as csv_file, open(output_file, 'w') as json_file: diff --git a/src/utils/movielens_etl/ml10m_to_json.py b/hermes/utils/movielens_etl/ml10m_to_json.py similarity index 100% rename from src/utils/movielens_etl/ml10m_to_json.py rename to hermes/utils/movielens_etl/ml10m_to_json.py diff --git a/src/utils/movielens_etl/ml1m_to_json.py b/hermes/utils/movielens_etl/ml1m_to_json.py similarity index 100% rename from src/utils/movielens_etl/ml1m_to_json.py rename to hermes/utils/movielens_etl/ml1m_to_json.py diff --git a/src/utils/movielens_etl/ml20m_to_json.py b/hermes/utils/movielens_etl/ml20m_to_json.py similarity index 100% rename from src/utils/movielens_etl/ml20m_to_json.py rename to hermes/utils/movielens_etl/ml20m_to_json.py diff --git a/src/utils/movielens_etl/movielens.py b/hermes/utils/movielens_etl/movielens.py similarity index 100% rename from src/utils/movielens_etl/movielens.py rename to hermes/utils/movielens_etl/movielens.py diff --git a/src/utils/remove_templates.py b/hermes/utils/remove_templates.py similarity index 100% rename from src/utils/remove_templates.py rename to hermes/utils/remove_templates.py diff --git a/src/utils/wiki_categories.py b/hermes/utils/wiki_categories.py similarity index 100% rename from src/utils/wiki_categories.py rename to hermes/utils/wiki_categories.py diff --git a/src/utils/xml_to_json.py b/hermes/utils/xml_to_json.py similarity index 100% rename from src/utils/xml_to_json.py rename to hermes/utils/xml_to_json.py diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6472b9a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,28 @@ +boto==2.36.0 +click==5.1 +cycler==0.9.0 +docopt==0.6.2 +hermes==1.0 +matplotlib==1.5.0 +mesos==0.25.0 +mesos.cli==0.25.0 +mesos.interface==0.25.0 +mesos.native==0.25.0 +numpy==1.10.1 +pandas==0.17.0 +pbr==1.8.1 +protobuf==2.6.1 +psutil==3.2.2 +py==1.4.30 +pyparsing==2.0.5 +pytest==2.8.2 +python-dateutil==2.4.2 +pytz==2015.7 +scikit-learn==0.17 +scipy==0.16.1 +six==1.10.0 +stevedore==1.9.0 +virtualenv==13.1.2 +virtualenv-clone==0.2.6 +virtualenvwrapper==4.7.1 +wheel==0.26.0 diff --git a/scripts/create_file_containing_paths.py b/scripts/create_file_containing_paths.py new file mode 100644 index 0000000..596997f --- /dev/null +++ b/scripts/create_file_containing_paths.py @@ -0,0 +1,81 @@ +"""Create json_paths.txt and schema_paths.txt that you can pass in to hermes. +Outputs: + 1. json_paths.txt: lists all path to JSON files used in hermes + 2. 
schema_paths.txt: lists all path to schema files used in hermes +""" + +import os +from distutils.util import strtobool + +def file_accessible(filepath, mode): + """Check if a file exists and is accessible.""" + try: + f = open(filepath, mode) + f.close() + except IOError as e: + return False + + return True + +def parse_yn(answer): + answer.upper().strip() + +def main(): + + # create output directory if it did not exist + output_dir = os.path.dirname(os.path.realpath(__file__)) + "/output" + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # ask user for path to JSON file and its respective schema file + i = 0 + json_paths = [] + schema_paths = [] + is_last = False + while True: + while True: + json_path = raw_input("(" + str(i) + ") Enter path to a JSON file: ") + if file_accessible(json_path.strip(), "r"): + json_paths.append(json_path) + break + else: + print "Please input a JSON file that exists or is accessible." + while True: + schema_path = raw_input("(" + str(i) + ") Enter path to its respective schema file (or empty string if there is no schema): ") + if file_accessible(schema_path.strip(), "r"): + schema_paths.append(schema_path) + break + elif schema_path.strip() == "": + schema_paths.append("") + break + else: + print "Please input a schema file that exists or is accessible." + while True: + add_more = raw_input("Do you need to add more JSON file? [Y/N] ") + try: + if bool(strtobool(add_more.upper().strip())): + i = i + 1 + else: + is_last = True + break + except ValueError: + print "Please respond with a Y or N." + if is_last: + break + + # create a file with a list of JSON file paths + json_file = output_dir + "/json_paths.txt" + with open(json_file, "w") as f: + for json_path in json_paths: + f.write(json_path + "\n") + + # create a file with a list of schema file paths + schema_file = output_dir + "/schema_paths.txt" + with open(schema_file, "w") as f: + for schema_path in schema_paths: + f.write(schema_path + "\n") + + return + +if __name__ == "__main__": + main() diff --git a/scripts/list_requirements.sh b/scripts/list_requirements.sh new file mode 100755 index 0000000..ef0028b --- /dev/null +++ b/scripts/list_requirements.sh @@ -0,0 +1 @@ +pip freeze > $PWD/../requirements.txt diff --git a/scripts/run_once.sh b/scripts/run_once.sh new file mode 100755 index 0000000..4278dc6 --- /dev/null +++ b/scripts/run_once.sh @@ -0,0 +1,2 @@ +python setup.py install +pip install --editable . 
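The state machine introduced above (hermes/modules/statemachine.py, wired up by add_states() and set_start() in hermesctl.py before stateMachine.run(cargo)) treats every state as a plain function: a non-end state receives the shared Cargo and returns the next state function together with the cargo, while an end state does its work and returns nothing. A minimal, self-contained sketch of that pattern, using a condensed runner and hypothetical toy states rather than the real hermes states, could look like this:

```python
class StateMachine(object):
    """Condensed illustration of hermes/modules/statemachine.py."""

    def __init__(self):
        self.handlers = []
        self.endStates = []
        self.startState = None

    def add_state(self, handler, isEndState=0):
        self.handlers.append(handler)
        if isEndState:
            self.endStates.append(handler)

    def set_start(self, handler):
        self.startState = handler

    def run(self, cargo):
        handler = self.startState
        while True:
            newState, cargo = handler(cargo)
            if newState in self.endStates:
                newState(cargo)  # end states do their work and return nothing
                break
            handler = newState

# Hypothetical toy states standing in for start_state, json_to_rdd_state, etc.
def load_state(cargo):
    cargo["data"] = [1, 2, 3]
    return report_state, cargo

def report_state(cargo):
    print("loaded %d records" % len(cargo["data"]))
    return

machine = StateMachine()
machine.add_state(load_state)
machine.add_state(report_state, isEndState=1)
machine.set_start(load_state)
machine.run({})
```

Adding a new stage (for example a content-based branch after split_data_state) then only requires writing another handler with the same return convention and registering it with add_state().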
diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4ce2f1e --- /dev/null +++ b/setup.py @@ -0,0 +1,76 @@ +from __future__ import print_function +from setuptools import setup, find_packages +from setuptools.command.test import test as TestCommand +import codecs +import os +import sys +import re + +here = os.path.abspath(os.path.dirname(__file__)) + +def read(*parts): + # intentionally *not* adding an encoding option to open + return codecs.open(os.path.join(here, *parts), 'r').read() + +def find_version(*file_paths): + version_file = read(*file_paths) + version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", + version_file, re.M) + if version_match: + return version_match.group(1) + raise RuntimeError("Unable to find version string.") + +long_description = read('README.md') + +class PyTest(TestCommand): + def finalize_options(self): + TestCommand.finalize_options(self) + self.test_args = ['--strict', '--verbose', '--tb=long', 'tests'] # tests if rename src to hermes + self.test_suite = True + + def run_tests(self): + import pytest + errno = pytest.main(self.test_args) + sys.exit(errno) + +setup( + name='hermes', + version=find_version('hermes', '__init__.py'), + url='http://github.com/lab41/hermes/', + license='Apache Software License', + author='Lab 41', + description='Exploration of Recommender Systems', + long_description=long_description, + tests_require=['pytest'], + install_requires=['click', + ], + cmdclass={'test': PyTest}, + entry_points={ + 'console_scripts': [ + 'hermes = hermes.hermesctl:main', + ], + }, + py_modules=['hermes'], + #scripts=['scripts/somescript.py'], + packages=['hermes', 'hermes.modules', 'hermes.metrics', 'hermes.utils'], + include_package_data=True, + platforms='any', + test_suite='tests.test_hermes.py', + zip_safe=False, + #package_data={'hermes': ['templates/**', 'static/*/*']}, + classifiers = [ + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 3', + 'Development Status :: 1', + 'Natural Language :: English', + 'Environment :: Spark Environment', + 'Intended Audience :: Developers, Data Scientists', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: MAC OS X', + 'Topic :: Recommender System', + ], + extras_require={ + 'testing': ['pytest'], + } +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_hermes.py b/tests/test_hermes.py new file mode 100644 index 0000000..e69de29 From 871242b2af191aeca087249141dfd2993f0bf5c2 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Thu, 10 Dec 2015 11:08:04 -0800 Subject: [PATCH 11/39] implemented a working base framework wip: rename hermes_ui to hermesui and get started on config and vector generator wip: not yet tested, implement factory model on vectorgenerator.py wip: not yet tested, implement configparser wip: configparser works until start_state wip: start_state working wip: fix important change to differentiate between uservectordata vs contentvectordata wip: work until develop_model_states wip: still in develop_model_state, add two parents for each vector in vectorgenerator wip: cleanup code a bit implemented base framework --- hermes/configs/config_1.ini | 19 +++ hermes/configs/list_of_files.ini | 17 +++ hermes/configs/my_config.ini | 16 +++ hermes/configs/my_list_of_files.ini | 10 ++ hermes/hermes.py | 182 +++++++++++-------------- hermes/hermesctl.py | 175 +++++++++++++++++++++--- hermes/{hermes_ui.py => hermesui.py} | 4 +- 
hermes/metrics/performance_metrics.py | 23 +--- hermes/modules/cargo.py | 17 +-- hermes/modules/config.py | 31 +++++ hermes/modules/data.py | 38 ++++++ hermes/modules/datum.py | 22 --- hermes/modules/helper.py | 13 ++ hermes/modules/metricgenerator.py | 61 +++++++++ hermes/modules/recommendergenerator.py | 121 ++++++++++++++++ hermes/modules/vectorgenerator.py | 147 ++++++++++++++++++++ 16 files changed, 726 insertions(+), 170 deletions(-) create mode 100644 hermes/configs/config_1.ini create mode 100644 hermes/configs/list_of_files.ini create mode 100644 hermes/configs/my_config.ini create mode 100644 hermes/configs/my_list_of_files.ini rename hermes/{hermes_ui.py => hermesui.py} (96%) create mode 100644 hermes/modules/config.py create mode 100644 hermes/modules/data.py delete mode 100644 hermes/modules/datum.py create mode 100644 hermes/modules/helper.py create mode 100644 hermes/modules/metricgenerator.py create mode 100644 hermes/modules/recommendergenerator.py create mode 100644 hermes/modules/vectorgenerator.py diff --git a/hermes/configs/config_1.ini b/hermes/configs/config_1.ini new file mode 100644 index 0000000..64fd6ef --- /dev/null +++ b/hermes/configs/config_1.ini @@ -0,0 +1,19 @@ +[meta] +# TODO: still has not handle meta +output_directory = /output/wikipedia_cbkmeans + +[datasets] +vectorizer = wiki +user_vector_data = ["edit_history"] +user_vector_types = ["num_edits_ceil"] +content_vector_data = ["full_text"] +content_vector_types = ["glove_model"] +glove_model = /data/glove/glove.txt + +[recommenders] +recommenders = ["CBWithKMeans"] +# TODO: still has not handle additional variables like top_n +top_n = 20 + +[metrics] +metrics = ["RMSE", "MAE"] \ No newline at end of file diff --git a/hermes/configs/list_of_files.ini b/hermes/configs/list_of_files.ini new file mode 100644 index 0000000..9c71011 --- /dev/null +++ b/hermes/configs/list_of_files.ini @@ -0,0 +1,17 @@ +[wiki] +# It looks like we will have to support globing, or else listing the Wikipedia +# data files is going to be a Herculean task +edit_history = /Users/tiffanyj/datasets/wikipedida/edits/*json.gz +full_text = /Users/tiffanyj/datasets/wikipedia/fulltext/*json.gz +[movielens] +# 20M data +movielens_20m_ratings=/data/ml/20m/ratings.json.gz +movielens_20m_tags=/data/ml/20m/tags.json.gz +movielens_20m_movies=/data/ml/20m/movies.json.gz +# 10M data +movielens_10m_ratings=/data/ml/10m/ratings.json.gz +movielens_10m_tags=/data/ml/10m/tags.json.gz +movielens_10m_movies=/data/ml/10m/movies.json.gz +# 1M data +movielens_1m_ratings=/data/ml/10m/ratings.json.gz +movielens_1m_movies=/data/ml/10m/movies.json.gz \ No newline at end of file diff --git a/hermes/configs/my_config.ini b/hermes/configs/my_config.ini new file mode 100644 index 0000000..5d5bc57 --- /dev/null +++ b/hermes/configs/my_config.ini @@ -0,0 +1,16 @@ +[datasets] +vectorizer = movielens +user_vector_data = ["movielens_10m_ratings"] +user_vector_schemas = ["movielens_10m_ratings_schema"] +user_vector_types = ["ratings"] +#content_vector_data = ["movielens_10m_movies"] +#content_vector_schemas = ["movielens_10m_movies_schema"] +#content_vector_types = ["genre"] + +[recommenders] +recommenders = ["ALS"] +#user_recommenders = ["ALS"] +#content_recommenders = [""] + +[metrics] +metrics = ["RMSE", "MAE"] \ No newline at end of file diff --git a/hermes/configs/my_list_of_files.ini b/hermes/configs/my_list_of_files.ini new file mode 100644 index 0000000..74ff57f --- /dev/null +++ b/hermes/configs/my_list_of_files.ini @@ -0,0 +1,10 @@ +[movielens] +# 10M data 
+movielens_10m_ratings = /Users/tiffanyj/datasets/movielens/movielens_10m_ratings.json.gz +movielens_10m_tags = /Users/tiffanyj/datasets/movielens/movielens_10m_tags.json.gz +movielens_10m_movies = /Users/tiffanyj/datasets/movielens/movielens_10m_movies.json.gz + +# 10M schema +movielens_10m_ratings_schema = /Users/tiffanyj/datasets/movielens/movielens_20m_ratings_schema.json +movielens_10m_tags_schema = /Users/tiffanyj/datasets/movielens/movielens_20m_tags_schema.json +movielens_10m_movies_schema = /Users/tiffanyj/datasets/movielens/movielens_20m_movies_schema.json \ No newline at end of file diff --git a/hermes/hermes.py b/hermes/hermes.py index 959aae6..1b2e840 100644 --- a/hermes/hermes.py +++ b/hermes/hermes.py @@ -1,39 +1,43 @@ """Helper functions to hermesctl.py""" import json +import logging import os -from pyspark.sql.types import StructType -import hermes_ui +import hermesui import metrics.performance_metrics import modules.datum +import modules.metricgenerator as metricgenerator +import modules.recommendergenerator as recommendergenerator import modules.timer +import modules.vectorgenerator as vectorgenerator # TODO: empty certain items in cargo after no longer needed? # TODO: when to use error_state? do try-catch for all states? +# get logger +logger = logging.getLogger("hermes") + def start_state(cargo): """Start of the state machine. Create HDFS directory and upload the input data. Returns: json_to_rdd_state as next state """ - if cargo.verbose: cargo.logger.debug("In start_state:") - - if (len(cargo.json_paths) != len(cargo.schema_paths)) and (len(cargos.schema_paths) > 0): - cargo.error_msg = "Each JSON file does not have its respective schema file." - newState = error_state - return newstate, cargo + if cargo.verbose: logger.debug("In start_state:") - if cargo.verbose: cargo.logger.debug("Creating the hdfs directory " + cargo.hdfs_dir) + if cargo.verbose: logger.debug("Creating the hdfs directory " + cargo.hdfs_dir) os.system("hdfs dfs -mkdir " + cargo.hdfs_dir) - for i in range(0, len(cargo.json_paths)): - json_path = cargo.json_paths[i] - if cargo.verbose: cargo.logger.debug("Loading JSON file " + json_path + " into hdfs directory " + cargo.hdfs_dir) - os.system("hdfs dfs -put " + json_path + " " + cargo.hdfs_dir + "/" + os.path.basename(json_path)) + def load_json_files(datas): + for i in range(0, len(datas)): + json_path = datas[i].datapath + if cargo.verbose: logger.debug("Loading JSON file " + json_path + " into hdfs directory " + cargo.hdfs_dir) + os.system("hdfs dfs -put " + json_path + " " + cargo.hdfs_dir + "/" + os.path.basename(json_path)) + + load_json_files(cargo.datas) newState = json_to_rdd_state - if cargo.verbose: cargo.logger.debug("start_state -> json_to_rdd_state") + if cargo.verbose: logger.debug("start_state -> json_to_rdd_state") return newState, cargo @@ -42,56 +46,28 @@ def json_to_rdd_state(cargo): Returns: split_data_state as next state """ - if cargo.verbose: cargo.logger.debug("In json_to_rdd_state:") - - num_json_files = len(cargo.json_paths) - num_schema_files = len(cargo.schema_paths) - - # load schema files - schemas = [] - for i in range(0, num_schema_files): - schema_path = cargo.schema_paths[i] - if not schema_path: - # no schema for its respective json file - schemas.append(None) - else: - if cargo.verbose: cargo.logger.debug("Loading schema file %s" % schema_path) - with open(schema_path, "r") as schema_file: - schema = StructType.fromJson(json.load(schema_file)) - schemas.append(schema) - - # create RDD for each JSON file and store 
it in a Datum object - datums = [] - for i in range(0, num_json_files): - json_path = cargo.json_paths[i] - schema_path = cargo.schema_paths[i] - try: - schema = schemas[i] - except IndexError: - schema = None - - if cargo.verbose: cargo.logger.debug("Creating dataframe based on the content of the json file %s" % json_path) - dataframe = cargo.scsingleton.sqlCtx.read.json("hdfs://" + cargo.fs_default_ip_addr + "/" + cargo.hdfs_dir + "/" + os.path.basename(json_path), schema=schema) - # explicitly repartition RDD after loading so that more tasks can run on it in parallel - # by default, defaultMinPartitions == defaultParallelism == estimated # of cores across all of the machines in your cluster - dataframe = dataframe.repartition(cargo.scsingleton.sc.defaultParallelism * 3) - - if schema is None: - schema = dataframe.schema - - rdd_format = hermes_ui._ask_user_for_rdd_format(schema_path, schema.names) - - if cargo.verbose: cargo.logger.debug("Creating RDD based on the format given by the user for json file %s" % json_path) - rdd = dataframe.map(lambda row: tuple(row[i] for i in rdd_format)).cache() - - if cargo.verbose: cargo.logger.debug("Storing RDD in Datum object for json file %s" % json_path) - datum = modules.datum.Datum(json_path, rdd) - datums.append(datum) - - cargo.datums = datums + if cargo.verbose: logger.debug("In json_to_rdd_state:") + + # create RDD for each JSON file and store it in Cargo's vectors list + for i in range(0, len(cargo.datas)): + data = cargo.datas[i] + if cargo.verbose: logger.debug("Working with json file %s" % data.datapath) + + if cargo.verbose: logger.debug("Creating dataframe based on the content of the json file") + datapath_in_hdfs = "hdfs://" + cargo.fs_default_ip_addr + "/" + cargo.hdfs_dir + "/" + os.path.basename(data.datapath) + data.set_dataframe(cargo.scsingleton, datapath_in_hdfs) + + if cargo.verbose: logger.debug("Creating RDD based on the computed dataframe and configuration provided by the user") + cargo.vectors.append( vectorgenerator.VectorFactory().create_obj_vector(cargo.scsingleton.sqlCtx, data, cargo.support_files) ) + + + # TODO: clean cargo? + # cargo.datas = [] + # cargo.hdfs_dir = None + # cargo.fs_default_ip_addr = None newState = split_data_state - if cargo.verbose: cargo.logger.debug("json_to_rdd_state -> split_data_state") + if cargo.verbose: logger.debug("json_to_rdd_state -> split_data_state") return newState, cargo @@ -100,36 +76,39 @@ def split_data_state(cargo): Returns: next state dependent whether or not it is using collaborative filtering or content based """ - if cargo.verbose: cargo.logger.debug("In split_data_state:") + if cargo.verbose: logger.debug("In split_data_state:") - for i in range(0, len(cargo.datums)): - datum = cargo.datums[i] - weights, seed = hermes_ui._ask_user_for_split_percentage(datum.json_path) - datum.split_data(weights, seed) + for i in range(0, len(cargo.vectors)): + vector = cargo.vectors[i] + weights, seed = hermesui._ask_user_for_split_percentage(vector.data.datapath) + vector.split_data(weights, seed) - newState = develop_model_state - if cargo.verbose: cargo.logger.debug("split_data_state -> develop_model_state") + newState = make_prediction_state + if cargo.verbose: logger.debug("split_data_state -> make_prediction_state") return newState, cargo -def develop_model_state(cargo): - """Develop model based on the train data. This model will be used to predict test data. +def make_prediction_state(cargo): + """Develop model based on the train data and make prediction based on this model. 
Returns: calculate_metrics_state as next state """ - if cargo.verbose: cargo.logger.debug("In develop_model_state:") - - for i in range(0, len(cargo.datums)): - datum = cargo.datums[i] - with modules.timer.Timer() as t: - # TODO: build model, please do not hardcode what to use for model - from pyspark.mllib.recommendation import ALS - cargo.model = ALS.train(datum.trainingRdd, rank=3) - if cargo.verbose: cargo.logger.debug("Building model takes %s seconds" % t.secs) + if cargo.verbose: logger.debug("In make_prediction_state:") + for i in range(0, len(cargo.vectors)): + for r in cargo.recommenders: + # TODO: implement other implementations, ie. WithTfidf(), etc. + # default is WithoutTfidf() + recommender = recommendergenerator.RecommenderFactory().create_obj_recommender(r, cargo.vectors[i]) + # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithTfidf()) + # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithoutTfidf()) + # etc. + with modules.timer.Timer() as t: + cargo.vectors[i].prediction_vector = recommender.make_prediction() + if cargo.verbose: logger.debug("Making prediction takes %s seconds" % t.secs) newState = calculate_metrics_state - if cargo.verbose: cargo.logger.debug("develop_model_state -> calculate_metrics_state") + if cargo.verbose: logger.debug("make_prediction_state -> calculate_metrics_state") return newState, cargo @@ -138,21 +117,26 @@ def calculate_metrics_state(cargo): Returns: None because this is the last state. """ - if cargo.verbose: cargo.logger.debug("In calculate_metrics_state:") - - for i in range(0, len(cargo.datums)): - datum = cargo.datums[i] - with modules.timer.Timer() as t: - # TODO: make a prediction, please do not hardcode what to do here - testPredRDD = cargo.model.predictAll( datum.testRdd.map( lambda x: (x[0], x[1]) ) ).cache() - if cargo.verbose: cargo.logger.debug("Making prediction takes %s seconds" % t.secs) - with modules.timer.Timer() as t: - # TODO: calculate metric, please do not hardcode what to use for metric - testRmse = metrics.performance_metrics.calculate_rmse_using_rdd(datum.testRdd, testPredRDD) - if cargo.verbose: cargo.logger.debug("Calculating metric takes %s seconds" % t.secs) - print "testRmse", testRmse - - if cargo.verbose: cargo.logger.debug("calculate_metrics_state -> end_state") + if cargo.verbose: logger.debug("In calculate_metrics_state:") + + # create a metric executor + executor = metricgenerator.MetricExecutor(metricgenerator.Metric()) + + # TODO: figure out why logger prints INFO twice + for i in range(0, len(cargo.vectors)): + logger.info("-" * 80) + logger.info("Data: %s" % cargo.vectors[i].data.datapath) + for m in cargo.metrics: + # check if metric exists + metric = metricgenerator.MetricFactory().create_obj_metric(m) + # set metric in executor + executor.change_metric(metric) + # execute the metric + with modules.timer.Timer() as t: + logger.info("Metric: %s = %f" % (m, executor.execute(cargo.vectors[i]))) + if cargo.verbose: logger.debug("Calculating metric takes %s seconds" % t.secs) + logger.info("-" * 80) + if cargo.verbose: logger.debug("calculate_metrics_state -> end_state") return @@ -160,8 +144,8 @@ def error_state(cargo): """Error state. Print out the error messages. This is an end state. Returns: None because this is the last state. 
""" - if cargo.verbose: cargo.logger.debug("In error_state:") - cargo.logger.error("ERROR: " + cargo.error_msg) - if cargo.verbose: cargo.logger.debug("error_state -> end_state") + if cargo.verbose: logger.debug("In error_state:") + logger.error("ERROR: " + cargo.error_msg) + if cargo.verbose: logger.debug("error_state -> end_state") return diff --git a/hermes/hermesctl.py b/hermes/hermesctl.py index 753f46b..cb41a66 100644 --- a/hermes/hermesctl.py +++ b/hermes/hermesctl.py @@ -1,10 +1,18 @@ """Script to run hermes via command line.""" import click +import ConfigParser +import itertools +import json +import logging import sys +from pyspark import SparkConf import hermes +import modules.config as Config +from modules.data import UserVectorData, ContentVectorData from modules.cargo import Cargo +from modules.singleton import SCSingleton from modules.statemachine import StateMachine @@ -15,20 +23,19 @@ def add_states(stateMachine): stateMachine.add_state(hermes.start_state) stateMachine.add_state(hermes.json_to_rdd_state) stateMachine.add_state(hermes.split_data_state) - stateMachine.add_state(hermes.develop_model_state) + stateMachine.add_state(hermes.make_prediction_state) stateMachine.add_state(hermes.calculate_metrics_state, isEndState=1) stateMachine.add_state(hermes.error_state, isEndState=1) stateMachine.set_start(hermes.start_state) return -def create_logger(): - import logging - logger = logging.getLogger(__name__) +def create_logger(name): + logger = logging.getLogger(name) logger.setLevel(logging.DEBUG) - # create file handler which logs even debug messages + # create hermes.log file that prints out debug messages fh = logging.FileHandler("hermes.log") fh.setLevel(logging.DEBUG) - # create console handler for stderr with a higher log level + # create console handler for stderr that prints out error messages che = logging.StreamHandler() che.setLevel(logging.ERROR) # create console handler for stdout for info, debug, and error level @@ -54,13 +61,143 @@ def create_logger(): return logger def create_sparkcontext(): - from pyspark import SparkConf - from modules.singleton import SCSingleton conf = SparkConf().setAppName("hermes") return SCSingleton(conf) -def extract_paths(file_with_paths): - return [line.rstrip("\n") for line in open(file_with_paths)] +def extract_configs(configs_path, list_of_files_config_path, cargo, logger): + # TODO: is there a better way to implement this function? 
+ + # extract list_of_files_config + lofcp = ConfigParser.ConfigParser() + lofcp.read(list_of_files_config_path) + + # helper functions for extracting configs + def handle_recognized_section_item(section, item_key, item_value): + if section == "datasets": + datasets_items[item_key] = item_value + return + if section == "recommenders": + if item_key == "recommenders": + cargo.recommenders.extend( json.loads(item_value) ) + return + if section == "metrics": + if item_key == "metrics": + cargo.metrics.extend( json.loads(item_value) ) + + def handle_unrecognized_section_item(section, item_key, item_value): + if section == "datasets": + # add support file + cargo.support_files[item_key] = item_value + return + if section == "recommenders": + logger.error("ERROR: skip unrecognized item " + item_key + " under section [" + section + "] in config" + config_path) + return + if section == "metrics": + logger.error("ERROR: skip unrecognized item " + item_key + " under section [" + section + "] in config" + config_path) + return + + def handle_dataset_section(dataset_items, config_path): + # make sure vectorizer is initialized in order to verify the section in list_of_files_config + # TODO: which is better? iterating through sections then items or iterating through just items of list_of_files_config? + + if not ("vectorizer" in datasets_items.keys()): + logger.error("ERROR: config " + config_path + " must have vectorizer specified.") + sys.exit() + + vectorizer = datasets_items["vectorizer"] + lofmap = Config.map_section(lofcp, vectorizer) + + # create UserVectorData or ContentVectorData or both + hasUserVector = False + # check it has the required items to build a UserVectorData + if set(Config.REQ_UV_HEADINGS) < set(datasets_items.keys()): + hasUserVector = True + create_datas(lofmap, vectorizer, datasets_items, config_path, isUserVector=True) + + hasContentVector = False + # check it has the required items to build a ContentVectorData + if set(Config.REQ_CV_HEADINGS) < set(datasets_items.keys()): + hasContentVector = True + create_datas(lofmap, vectorizer, datasets_items, config_path, isUserVector=False) + + if not hasUserVector and not hasContentVector: + logger.error("ERROR: config " + config_path + " does not have declaration for a user vector or a content vector") + sys.exit() + + def create_datas(lofmap, vectorizer, datasets_items, config_path, isUserVector): + """ + user_vector_data = movielens_10m_ratings, bleh_ratings + user_vector_schemas = movielens_10m_ratings_schema, bleh_schema + user_vector_types = ratings, bleh + + """ + + if isUserVector: + datapaths_heading = "user_vector_data" + vector_types_heading = "user_vector_types" + schemapaths_heading = "user_vector_schemas" + else: + datapaths_heading = "content_vector_data" + vector_types_heading = "content_vector_types" + schemapaths_heading = "content_vector_schemas" + + datapaths = json.loads(datasets_items[datapaths_heading]) + vector_types = json.loads(datasets_items[vector_types_heading]) + hasSchemas = False + if "user_vector_schemas" in datasets_items.keys(): + schemapaths = json.loads(datasets_items[schemapaths_heading]) + hasSchemas = True + + # check that a vector type is specified for each data + # TODO: multiple vector types for each data in the future? 
+ if len(datapaths) != len(vector_types): + logger.error("ERROR: must specify a vector type for each data in config " + config_path) + sys.exit() + + for i in range(0, len(datapaths)): + # set datapath + try: + datapath = lofmap[datapaths[i]] + except KeyError: + logger.error("ERROR: cannot find data " + datapath + " in the list_of_files_config for config " + config_path) + sys.exit() + # set vector_type + vector_type = vector_types[i] + # set schemapath + try: + if hasSchemas: schemapath = lofmap[schemapaths[i]] + except IndexError, KeyError: + schemapath = None + + if isUserVector: + uservectordata = UserVectorData(datapath, vector_type, schemapath, vectorizer) + cargo.datas.append(uservectordata) + else: + contentvectordata = ContentVectorData(datapath, vector_type, schemapath, vectorizer) + cargo.datas.append(contentvectordata) + + # extract configs + for config_path in configs_path: + cp = ConfigParser.ConfigParser() + cp.read(config_path) + datasets_items = {} + # extract sections + for section in cp.sections(): + if section in Config.HEADINGS.keys(): + # extract section's items + for (item_key, item_value) in cp.items(section): + if item_key in Config.HEADINGS.get(section): + handle_recognized_section_item(section, item_key, item_value) + else: + handle_unrecognized_section_item(section, item_key, item_value) + # end extract item + else: + logger.error("ERROR: skip unrecognized section heading [" + section + "] in config " + config_path) + # handle "datasets" section + if section == "datasets": + handle_dataset_section(datasets_items, config_path) + # end extract sections + # end extract configs def print_version(ctx, param, value): """Print the current version of hermes and exit.""" @@ -75,6 +212,7 @@ def print_version(ctx, param, value): click.echo(version) ctx.exit() +# TODO: add option to print what recommenders @click.command() @click.option("--verbose", is_flag=True, \ help="Print debug messages") @@ -84,13 +222,14 @@ def print_version(ctx, param, value): help="Name of HDFS directory to store input data.") # IP address of fs.default.name used in HDFS @click.argument("fs_default_ip_addr", default="localhost:9000") -# Path to a file that lists JSON files. 
-@click.argument("file_to_json_paths", type=click.Path(exists=True), nargs=1) -@click.option("--schemas", type=click.Path(exists=True), nargs=1, \ - help="Path to a file that lists each JSON file's schema.") -def main(verbose, hdfs_dir, fs_default_ip_addr, file_to_json_paths, schemas): +@click.argument("list_of_files_config", type=click.Path(exists=True), nargs=1) +@click.argument("configs", type=click.Path(exists=True), nargs=-1) +def main(verbose, hdfs_dir, fs_default_ip_addr, list_of_files_config, configs): """Hermes allows you to run multiple recommender system metrics on your chosen dataset.""" + # create logger + logger = create_logger("hermes") + # create state machine stateMachine = StateMachine() add_states(stateMachine) @@ -100,12 +239,12 @@ def main(verbose, hdfs_dir, fs_default_ip_addr, file_to_json_paths, schemas): # add items to cargo cargo.scsingleton = create_sparkcontext() - cargo.logger = create_logger() cargo.verbose = verbose cargo.hdfs_dir = hdfs_dir cargo.fs_default_ip_addr = fs_default_ip_addr - cargo.json_paths = extract_paths(file_to_json_paths) - cargo.schema_paths = extract_paths(schemas) + + # extract configs and add them to cargo + extract_configs(configs, list_of_files_config, cargo, logger) # run state machine stateMachine.run(cargo) diff --git a/hermes/hermes_ui.py b/hermes/hermesui.py similarity index 96% rename from hermes/hermes_ui.py rename to hermes/hermesui.py index aea8d41..89c372b 100644 --- a/hermes/hermes_ui.py +++ b/hermes/hermesui.py @@ -55,7 +55,7 @@ def _ask_user_for_split_percentage(datum_json_path): """ print "How do you want to split your data?" print "For example: If you wanted to split the data into " - print "60\% training, 40\% test, 0\% validation, seed = 11, please type in:" + print "60% training, 40% test, 0% validation, seed = 11, please type in:" print "Percentage for training: 60" print "Percentage for test: 40" print "Percentage for validation: 0" @@ -94,7 +94,7 @@ def _check_sum_percentage(a, b, c): print "Please provide a number from 0 - 100." while True: try: - testPercentage = int(raw_input("Precentage for test: ").strip()) + testPercentage = int(raw_input("Percentage for test: ").strip()) except ValueError: print "Please provide a valid number." else: diff --git a/hermes/metrics/performance_metrics.py b/hermes/metrics/performance_metrics.py index 5a347a6..934df79 100644 --- a/hermes/metrics/performance_metrics.py +++ b/hermes/metrics/performance_metrics.py @@ -54,7 +54,7 @@ def get_perform_metrics(y_test, y_train, y_predicted, content_array, sqlCtx, num # RMSE ----------------------------------------------------------------- -def calculate_rmse_using_rdd(y_actual, y_predicted): +def calculate_rmse(y_actual, y_predicted): """ Determines the Root Mean Square Error of the predictions. @@ -73,26 +73,11 @@ def calculate_rmse_using_rdd(y_actual, y_predicted): sum_ratings_diff_sq = ratings_diff_sq.reduce(add) num = ratings_diff_sq.count() - return sqrt(sum_ratings_diff_sq / float(num) ) - -def calculate_rmse_using_array(y_actual, y_predicted): - """ - Determines the Root Mean Square Error of the predictions. - - Args: - y_actual: actual ratings in the format of an array of [ (userId, itemId, actualRating) ] - y_predicted: predicted ratings in the format of an array of [ (userId, itemId, predictedRating) ] - - Assumptions: - y_actual and y_predicted are in the same order. 
- - """ - return sqrt(mean_squared_error(y_actual, y_predicted)) - #return mean_squared_error(y_actual, y_predicted) ** 0.5 + return sqrt(sum_ratings_diff_sq / float(num)) # MAE ------------------------------------------------------------------ -def calculate_mae_using_rdd(y_actual, y_predicted): +def calculate_mae(y_actual, y_predicted): """ Determines the Mean Absolute Error of the predictions. @@ -699,4 +684,4 @@ def calc_relevant_rank_stats(y_actual, y_predicted, sqlCtx): rank_stats = np.mean(max_ranks_local, axis=0) - return rank_stats \ No newline at end of file + return rank_stats diff --git a/hermes/modules/cargo.py b/hermes/modules/cargo.py index 456b2cf..b07171d 100644 --- a/hermes/modules/cargo.py +++ b/hermes/modules/cargo.py @@ -3,24 +3,21 @@ class Cargo(object): Args: scsingleton: Spark Context. There can only be one scsingleton running. - logger: - verbose: + verbose: a boolean variable that prints out log messages hdfs_dir: fs_default_ip_addr: - json_paths: - schema_path: - schema: error_msg: """ + # TODO: implement cargo as object pool model? def __init__(self): self.scsingleton = None - self.logger = None self.verbose = False self.hdfs_dir = None self.fs_default_ip_addr = None - self.json_paths = [] - self.schema_paths = [] - self.datums = [] - self.model = None self.error_msg = "" + self.datas = [] # used until json_to_rdd_state + self.vectors = [] # used until develop_model_state + self.support_files = {} + self.recommenders = [] + self.metrics = [] diff --git a/hermes/modules/config.py b/hermes/modules/config.py new file mode 100644 index 0000000..2d34b39 --- /dev/null +++ b/hermes/modules/config.py @@ -0,0 +1,31 @@ +import logging + +REQ_UV_HEADINGS = ("user_vector_data", "user_vector_types") +UV_HEADINGS = () + REQ_UV_HEADINGS + ("user_vector_schemas",) + +REQ_CV_HEADINGS = ("content_vector_data", "content_vector_types") +CV_HEADINGS = () + REQ_CV_HEADINGS + ("content_vector_schemas",) + +DATASETS_HEADINGS = ("vectorizer",) + UV_HEADINGS + CV_HEADINGS + +HEADINGS = { "datasets": DATASETS_HEADINGS, \ + "recommenders": ("recommenders"), \ + "metrics": ("metrics") \ + } + +# get logger +logger = logging.getLogger("hermes") + +def map_section(config_parser, section): + global logger + section_dict = {} + options = config_parser.options(section) + for option in options: + try: + section_dict[option] = config_parser.get(section, option) + if section_dict[option] == -1: + logger.debug(__name__ + ": map_section(): skipping option " + option) + except: + logger.error(__name__ + ": map_section(): exception on option " + option) + section_dict[option] = None + return section_dict diff --git a/hermes/modules/data.py b/hermes/modules/data.py new file mode 100644 index 0000000..a504fff --- /dev/null +++ b/hermes/modules/data.py @@ -0,0 +1,38 @@ +import helper +import vectorgenerator + +class Data(object): + + def __init__(self, datapath, vector_type, schemapath, vectorizer): + if helper.is_filepath_valid(datapath): + self.datapath = datapath + self.vector_type = vector_type + self.schema = helper.get_schema(schemapath) + self.dataframe = None + self.vectorizer = vectorizer + # TODO: do we need to know from which config the data is from? 
+ + def set_dataframe(self, scsingleton, datapath_in_hdfs): + self.dataframe = scsingleton.sqlCtx.read.json(datapath_in_hdfs, self.schema) + # explicitly repartition RDD after loading so that more tasks can run on it in parallel + # by default, defaultMinPartitions == defaultParallelism == estimated # of cores across all of the machines in your cluster + self.dataframe = self.dataframe.repartition(scsingleton.sc.defaultParallelism * 3) + + # set schema if it is not already set + if self.schema is None: + self.schema = self.dataframe.schema + +class UserVectorData(Data): + def __init__(self, datapath, vector_type, schemapath, vectorizer): + super(self.__class__, self).__init__(datapath, vector_type, schemapath, vectorizer) + self.which_vector = vectorgenerator.UserVector + +class ContentVectorData(Data): + def __init__(self, datapath, vector_type, schemapath, vectorizer): + super(self.__class__, self).__init__(datapath, vector_type, schemapath, vectorizer) + self.which_vector = vectorgenerator.ContentVector + + + + + diff --git a/hermes/modules/datum.py b/hermes/modules/datum.py deleted file mode 100644 index 8cfb8f0..0000000 --- a/hermes/modules/datum.py +++ /dev/null @@ -1,22 +0,0 @@ - -class Datum(object): - """Datum is a single data being subjected to - recommender system algorithms and performance metrics. - """ - - def __init__(self, json_path, rdd): - self.json_path = json_path - self.rdd = rdd - self.trainingRdd = None - self.testRdd = None - self.validationRdd = None - - def split_data(self, weights, seed): - trainingRdd, testRdd, validationRdd = self.rdd.randomSplit(weights, seed) - self.trainingRdd = trainingRdd.cache() - self.testRdd = testRdd.cache() - self.validationRdd = validationRdd.cache() - - - - diff --git a/hermes/modules/helper.py b/hermes/modules/helper.py new file mode 100644 index 0000000..c3aa2ec --- /dev/null +++ b/hermes/modules/helper.py @@ -0,0 +1,13 @@ +import os +import json +from pyspark.sql.types import StructType + + +def is_filepath_valid(filepath): + return True if os.path.isfile(filepath) else False + +def get_schema(schema_path): + if not schema_path: + return None + with open(schema_path, "r") as schema_file: + return StructType.fromJson(json.load(schema_file)) diff --git a/hermes/modules/metricgenerator.py b/hermes/modules/metricgenerator.py new file mode 100644 index 0000000..37105e7 --- /dev/null +++ b/hermes/modules/metricgenerator.py @@ -0,0 +1,61 @@ + +import os +import sys +sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/..")) +import metrics.performance_metrics as pm + +""" + +eggsecutor = MetricExecutor(RMSE()) +print eggsecutor.execute(vector) +eggsecutor.changeAlgorithm(PRFS()) +print eggsecutor.execute(vector) + +""" + +# ================================================================================ +# Metric Strategy +# ================================================================================ + +class MetricExecutor: + def __init__(self, metric): + self.metric = metric + + def execute(self, vector): + return self.metric.calculate_metric(vector) + + def change_metric(self, new_metric): + self.metric = new_metric + +# ================================================================================ +# List of metrics +# ================================================================================ + +class MetricFactory(object): + def create_obj_metric(self, metric_str): + which_metric = getattr(sys.modules[__name__], metric_str) + if not which_metric: + # cannot find class + raise ValueError + else: + return 
which_metric() + +class Metric: + def calculate_metric(self, vector=None) : + pass + +class RMSE(Metric): + def calculate_metric(self, vector): + return pm.calculate_rmse(vector.test_vector, vector.prediction_vector) + +class MAE(Metric): + def calculate_metric(self, vector): + return pm.calculate_mae(vector.test_vector, vector.prediction_vector) + +class PRFS(Metric): + def calculate_metric(self): + pass + + + + diff --git a/hermes/modules/recommendergenerator.py b/hermes/modules/recommendergenerator.py new file mode 100644 index 0000000..a086058 --- /dev/null +++ b/hermes/modules/recommendergenerator.py @@ -0,0 +1,121 @@ +""" + +with_tfidf = WithTfidf() +without_tfidf = WithoutTfidf() + +recommender = ALS(with_tfidf) +recommender.make_prediction() + +recommender = ALS(without_tfdif) +recommender.make_prediction() + +recommender = CBWithKMeans(with_tfidf) +recommender.make_prediction() + +recommender = CBWithKMeans(without_tfidf) +recommender.make_prediction() + +""" + +import logging +import sys +import timer +import pyspark.mllib.recommendation as mllib + + +# get logger +logger = logging.getLogger("hermes") + +# ================================================================================ +# Background implementation interface +# ================================================================================ + +class ImplementationInterface(object): + def make_prediction_with_als(self): + raise NotImplemented + + def make_prediction_with_cbkmeans(self): + raise NotImplemented + + +# ================================================================================ +# Concrete background implementations +# ================================================================================ + +# TODO: Interface is not necessary. +# Should we remove ImplementationInterface? Or keep it for design sake? +class WithTfidf(ImplementationInterface): + def make_prediction_with_als(self, vector): + # create ALS model with tf-idf + pass + + def make_prediction_with_cbkmeans(self, vector): + # create CB with K-means with tf-idf + pass + +class WithoutTfidf(ImplementationInterface): + def make_prediction_with_als(self, vector): + # create ALS model without tf-idf + # TODO: specify rank based on what the user wants + model = mllib.ALS.train(vector.training_vector, rank=3) + prediction_vector = model.predictAll( vector.test_vector.map( lambda x: (x[0], x[1]) ) ).cache() + return prediction_vector + + + def make_prediction_with_cbkmeans(self, vector): + # create CB with K-means without tf-idf + pass + +# ================================================================================ +# Target Interface +# ================================================================================ + +class AbstractInterface(object): + def make_prediction(self): + raise NotImplemented + +# ================================================================================ +# Bridge: bridge target interface & background implementation +# ================================================================================ + +# TODO: Interface is not necessary. +# Should we remove ImplementationInterface? Or keep it for design sake? 
+class Recommender(AbstractInterface): + def __init__(self, vector): + self.vector = vector + self.implementation = None + +# ================================================================================ +# Recommender Factory +# ================================================================================ + +class RecommenderFactory(object): + def create_obj_recommender(self, recommender_str, vector, implementation=WithoutTfidf()): + which_recommender = getattr(sys.modules[__name__], recommender_str) + if not which_recommender: + # cannot find class + raise ValueError + else: + return which_recommender(vector, implementation) + + +# ================================================================================ +# Variant of target interface +# ================================================================================ + +class ALS(Recommender): + def __init__(self, vector, implementation=WithoutTfidf()): + self.vector = vector + self.implementation = implementation + + def make_prediction(self): + return self.implementation.make_prediction_with_als(self.vector) + +class CBWithKMeans(Recommender): + def __init__(self, vector, implementation=WithoutTfidf()): + self.vector = vector + self.implementation = implementation + + def make_prediction(self): + return self.implementation.make_prediction_with_cbkmeans(self.vector) + diff --git a/hermes/modules/vectorgenerator.py b/hermes/modules/vectorgenerator.py new file mode 100644 index 0000000..ea09b7a --- /dev/null +++ b/hermes/modules/vectorgenerator.py @@ -0,0 +1,147 @@ +import data + +# vector generator == rdd generator + +# ================================================================================ +# Vector Factory +# ================================================================================ + +class VectorFactory(object): + + def create_vector(self, sqlCtx, data, support_files): + vector = data.which_vector + for cls in vector.__subclasses__(): + if cls.isSameDataInstance(data): + return cls(sqlCtx, data, support_files).vector + else: + # cannot find class that builds the data + raise ValueError + + def create_obj_vector(self, sqlCtx, data, support_files): + vector = data.which_vector + for cls in vector.__subclasses__(): + if cls.isSameDataInstance(data): + return cls(sqlCtx, data, support_files) + else: + # cannot find class that builds the data + raise ValueError + +# ================================================================================ +# Vector Factory Objects +# ================================================================================ + +class Vector(object): + def __init__(self, sqlCtx, data, support_files): + self.sqlCtx = sqlCtx + self.data = data + self.support_files = support_files + get_vector_type = getattr(self, data.vector_type) + if not get_vector_type: + self.vector = None + else: + self.vector = get_vector_type() + self.training_vector = None + self.test_vector = None + self.validation_vector = None + self.prediction_vector = None + + def split_data(self, weights, seed): + training_vector, test_vector, validation_vector = self.vector.randomSplit(weights, seed) + self.training_vector = training_vector + self.test_vector = test_vector + self.validation_vector = validation_vector + +# ================================================================================ +# User Vector and Content Vector Factory Objects +# ================================================================================ + +class UserVector(Vector): + pass + +class ContentVector(Vector): + pass + +# 
================================================================================ +# MovieLens +# ================================================================================ + +class MovieLens(object): + @classmethod + def isSameDataInstance(cls, comparisonData): + return comparisonData.vectorizer == "movielens" + +class MovieLensUserVector(UserVector, MovieLens): + def ratings(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)) + + def pos_ratings(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)).filter(lambda (u, m, r): r > 3) + + def ratings_to_interact(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, -1 if row.rating < 3 else 1)) + +class MovieLensContentVector(ContentVector, MovieLens): + def genre(self): + def genre_vectorizer(row): + return np.array(( + int(row.genre_action), + int(row.genre_adventure), + int(row.genre_animation), + int(row.genre_childrens), + int(row.genre_comedy), + int(row.genre_crime), + int(row.genre_documentary), + int(row.genre_drama), + int(row.genre_fantasy), + int(row.genre_filmnoir), + int(row.genre_horror), + int(row.genre_musical), + int(row.genre_mystery), + int(row.genre_romance), + int(row.genre_scifi), + int(row.genre_thriller), + int(row.genre_war), + int(row.genre_western), + )) + return self.data.dataframe.map(lambda row: (row.movie_id, )) + +# ================================================================================ +# Wiki +# ================================================================================ + +class Wiki(object): + @classmethod + def isSameDataInstance(cls, comparisonData): + return comparisonData.vectorizer == "wiki" + +class WikiUserVector(UserVector, Wiki): + def __init__(self): + super(self.__class__, self).__init__() + self.filtered = self.sqlCtx.sql("select * from ratings where redirect_target is null and article_namespace=0 and user_id is not null") + self.filtered.registerTempTable("wiki_ratings") + + def num_edits(self): + return self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings group by user_id, article_id") + + def any_interact(self): + return self.sqlCtx.sql("select user_id as user, article_id as item, 1 as rating from wiki_ratings group by user_id, article_id") + + def num_edits_ceil(self): + return self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki group by user_id, article_id")\ + .map(lambda (user, article, rating): (user, article, max(rating, 5))) + +class WikiContentVector(ContentVector, Wiki): + def __init__(self): + super(self.__class__, self).__init__() + self.filtered_content = sqlCtx.sqlCtx.sql("select * from content where redirect_target is null and article_namespace=0 and full_text is not null") + self.filtered_content.registerTempTable("wiki_content") + + def glove(self): + pass + + def category_map(self): + pass + +# ================================================================================ +# ADD ADDITIONAL UserVector and ContentVector based on a given data +# ================================================================================ From e2ffd6b9406b537c18bf69029d37ad54c82b0fb2 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Thu, 17 Dec 2015 15:29:53 -0800 Subject: [PATCH 12/39] complete READMEs wip: working on READMEs wip: working on configs.md README wip: convert tabs to spaces wip: complete configs.md readme and rename vector_type to vector_transformation we want to avoid 
confusion between vector_type that does vector transformations and vector type that differentiates between User Vector and content Vector wip: complete data_supported.md and add table of contents for all. also, fix wiki's vector trasnformation user_edits_ceil to read from wiki_ratings table. wip: cleanup spelling errors in data_supported.md wip: fix sp err in readmes, remove unnecessary interface in recommendergenerator.py, fix examples in metricgenerator.py wip: fix glossary format wip: putting the skeleton for framework.md wip: framework.md, use boolean for isEndState wip: framework.md wip: add comment and framework.md wip: framework.md wip: framework.md wip: framework.md, add more comments wip: framework.md and add more comments in modules wip: framework.md wip: framework.md for recommendergenerator.py wip: all docs complete! :) --- README.md | 22 +- docs/configs.md | 246 ++++++++ docs/data_supported.md | 164 +++++ docs/framework.md | 560 ++++++++++++++++++ docs/glossary.md | 82 +++ docs/installation.md | 56 +- docs/metrics_supported.md | 51 ++ docs/recommenders_supported.md | 117 ++++ docs/run.md | 49 ++ hermes/__init__.py | 1 - .../{my_config.ini => movielens_config.ini} | 5 +- .../configs/{config_1.ini => wiki_config.ini} | 4 +- .../data_prep/movieLens_vectorize.py | 0 {src => hermes}/data_prep/osm_vectoize.py | 0 {src => hermes}/data_prep/wiki_vectorize.py | 0 hermes/hermes.py | 213 ++++--- hermes/hermesctl.py | 498 +++++++++------- hermes/hermesui.py | 260 ++++---- hermes/modules/cargo.py | 45 +- hermes/modules/config.py | 50 +- hermes/modules/data.py | 55 +- hermes/modules/globals.py | 11 + hermes/modules/helper.py | 12 +- hermes/modules/metricgenerator.py | 26 +- hermes/modules/recommendergenerator.py | 99 ++-- hermes/modules/singleton.py | 2 + hermes/modules/statemachine.py | 91 +-- hermes/modules/timer.py | 16 +- hermes/modules/vectorgenerator.py | 190 +++--- {src => hermes}/utils/save_load.py | 0 scripts/create_file_containing_paths.py | 81 --- 31 files changed, 2161 insertions(+), 845 deletions(-) create mode 100644 docs/configs.md create mode 100644 docs/data_supported.md create mode 100644 docs/framework.md create mode 100644 docs/glossary.md create mode 100644 docs/metrics_supported.md create mode 100644 docs/recommenders_supported.md create mode 100644 docs/run.md rename hermes/configs/{my_config.ini => movielens_config.ini} (81%) rename hermes/configs/{config_1.ini => wiki_config.ini} (79%) rename {src => hermes}/data_prep/movieLens_vectorize.py (100%) rename {src => hermes}/data_prep/osm_vectoize.py (100%) rename {src => hermes}/data_prep/wiki_vectorize.py (100%) create mode 100644 hermes/modules/globals.py rename {src => hermes}/utils/save_load.py (100%) delete mode 100644 scripts/create_file_containing_paths.py diff --git a/README.md b/README.md index 80f6c10..591e539 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ This readme will be updated as the project progresses so stay tuned! ## Basic Installation Guide -For a detailed installation guide, please read on [Hermes Installation Guide](https://github.com/Lab41/hermes/tree/master/docs/installation.txt). +For a detailed installation guide, please read on [Hermes Installation Guide](https://github.com/Lab41/hermes/tree/master/docs/installation.md). 
### Dependencies: * Spark 1.5.1 @@ -47,6 +47,26 @@ Now, you can just run hermes the binary and it will prompt you with what you wan $ hermes ``` +## How to Run Hermes + +NOTE: Next implementation of Hermes will be set up so that it does not use pseudo-distributed mode in a single node cluster. + +For a detailed guide on how to run Hermes, please read on [How to Run Hermes](https://github.com/Lab41/hermes/tree/master/docs/run.md) guide. + +Hermes requires at least three arguments in order to run properly. +* fs_default_ip_addr: IP address of fs.default.name used in HDFS, ie. localhost:9000. +* list_of_files_config: A configuration file that lists all the json paths referenced by configs. +* configs: Users can provide an unlimited amount of configuration files that list what datasets to use and which recommender algorithms and metrics to apply to each dataset. + +With one configuration file: +```bash +$ hermes localhost:9000 ./hermes/configs/list_of_files.ini ./hermes/configs/config1.ini +``` + +With more than one configuration files: +```bash +$ hermes localhost:9000 ./hermes/configs/list_of_files.ini ./hermes/configs/config1.ini ./hermes/configs/config2.ini +``` ## State of Build diff --git a/docs/configs.md b/docs/configs.md new file mode 100644 index 0000000..09e2810 --- /dev/null +++ b/docs/configs.md @@ -0,0 +1,246 @@ +# Hermes's Configuration Files Explained + +* [List of Files Standard](#list-of-files-standard) + * [Vectorizer](#vectorizer) + * [JSON Paths](#json-paths) +* [Configuration File Standard](#configuration-file-standard) + * [Datasets](#datasets) + * [Vectorizer](#vectorizer) + * [Vectors](#vectors) + * [Optional Variables: Schemas & Support Files](#optional-variables) +* [Recommenders](#recommenders) +* [Metrics](#metrics) + +Hermes requires at least two configuration files: +* **list_of_files_config**: A configuration file that lists all the json paths referenced by configs. +* **configs**: Users can provide an unlimited amount of configuration files that list what datasets to use and which recommender algorithms and metrics to apply to each dataset. + +Each configuration file requires it to follow a certain standard. These standards will be further explained below. + +Saved configuration files can be found in hermes/hermes/configs in case you want to run a previously saved configuration. + +Before continuing, it might be beneficial if you understand the Hermes's framework by reading this [guide](https://github.com/Lab41/hermes/tree/master/docs/framework.md) first. + +## List of Files Standard + +Let's take a look at an example file called list_of_files.ini. 
+ +```bash +[movielens] +# 20M data +movielens_20m_ratings = /path/to/your/movielens/20m/ratings.json.gz +movielens_20m_tags = /path/to/your/movielens/20m/tags.json.gz +movielens_20m_movies = /path/to/your/movielens/20m/movies.json.gz + +# 10M data +movielens_10m_ratings = /path/to/your/movielens/10m/ratings.json.gz +movielens_10m_tags = /path/to/your/movielens/10m/tags.json.gz +movielens_10m_movies = /path/to/your/movielens/10m/movies.json.gz + +# 1M data +movielens_1m_ratings = /path/to/your/movielens/1m/ratings.json.gz +movielens_1m_tags = /path/to/your/movielens/1m/tags.json.gz +movielens_1m_movies = /path/to/your/movielens/1m/movies.json.gz + +# 20M schema +movielens_20m_ratings_schema = /path/to/your/movielens/20m/ratings_schema.json.gz +movielens_20m_tags_schema = /path/to/your/movielens/20m/tags_schema.json.gz +movielens_20m_movies_schema = /path/to/your/movielens/20m/movies_schema.json.gz + +# 10M schema +movielens_10m_ratings_schema = /path/to/your/movielens/10m/ratings_schema.json.gz +movielens_10m_tags_schema = /path/to/your/movielens/10m/tags_schema.json.gz +movielens_10m_movies_schema = /path/to/your/movielens/10m/movies_schema.json.gz + +# 1M schema +movielens_1m_ratings_schema = /path/to/your/movielens/1m/ratings_schema.json.gz +movielens_1m_tags_schema = /path/to/your/movielens/1m/tags.json_schema.gz +movielens_1m_movies_schema = /path/to/your/movielens/1m/movies_schema.json.gz +``` + +### Vectorizer + +A single data can be split into multiple JSON files. In this case, [movielens] is a data that is split into multiple JSON files. For lack of a better term, we call [movielens] a "vectorizer" variable. There can be multiple vectorizers in a list of files (ie. list_of_files.ini), but there can only be one vectorizer in a configuration file (ie. config.ini). + +Vectorizer plays an important role in that we know which data each JSON file is coming from. This check can be found in hermes/hermes/modules/vectorgenerator.py under a class function called isSameDataInstance() for each data instantiated class. What is checked in isSameDataInstance() has to match the vectorizer exactly. If it did not, Hermes will throw an error message. + +For example, in the case of the Movie Lens data, its vectorizer is "movielens". The check in the class MovieLens's isSameDataInstance() function will check that vectorizer is equal to "movielens". If you passed [MovieLens] to list_of_files.ini, for example, and the check in isSameDataInstance() is "movielens", it will fail. However, if you passed [movielens] to list_of_files.ini and the check in isSameDataInstance() is "movielens", it will pass. + +### JSON Paths + +Underneath the vectorizer heading, each variable (ie. movielens_20m_ratings, movielens_20m_tags, etc.) is a shorthand name for a specific JSON file. These variables will store the path to their individual JSON file. They will be used in the configuration file (ie. config.ini) as input to user_vector_data and content_vector_data variable. 
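+
+For reference, this is roughly how a vectorizer section of the list of files can be read into a dictionary of shorthand name to JSON path. The sketch below assumes Python 2's standard ConfigParser (which hermes/modules/config.py also uses); the file name is a placeholder.
+
+```python
+# Read list_of_files.ini and collect {shorthand_name: json_path} for one vectorizer.
+from ConfigParser import ConfigParser  # configparser in Python 3
+
+parser = ConfigParser()
+parser.read("list_of_files.ini")
+
+json_paths = {option: parser.get("movielens", option)
+              for option in parser.options("movielens")}
+# json_paths["movielens_20m_ratings"] -> "/path/to/your/movielens/20m/ratings.json.gz"
+```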
+
+## Configuration File Standard
+
+**If you want to know what data is currently supported by Hermes and the different ways you can parse the data (and how you can add your own data not yet supported), please check out the [List of Data Supported](https://github.com/Lab41/hermes/tree/master/docs/data_supported.md) guide.**
+
+**If you want to know what types of recommender system algorithms are currently supported by Hermes (and how you can add different algorithms not yet supported), please check out the [List of Recommender Systems Supported](https://github.com/Lab41/hermes/tree/master/docs/recommenders_supported.md) guide.**
+
+**If you want to know what types of metrics are currently supported by Hermes (and how you can add different metrics not yet supported), please check out the [List of Metrics Supported](https://github.com/Lab41/hermes/tree/master/docs/metrics_supported.md) guide.**
+
+Let's take a look at an example file called config.ini.
+
+```bash
+[datasets]
+vectorizer = movielens
+
+# user vector
+user_vector_data = ["movielens_10m_ratings", "movielens_20m_ratings"]
+user_vector_schemas = ["movielens_10m_ratings_schema", "movielens_20m_ratings_schema"]
+user_vector_transformations = ["ratings", "ratings_to_interact"]
+
+# content vector
+content_vector_data = ["movielens_10m_movies"]
+content_vector_schemas = ["movielens_10m_movies_schema"]
+content_vector_transformations = ["genre"]
+
+[recommenders]
+user_recommenders = ["ALS"]
+content_recommenders = ["CBWithKMeans"]
+
+[metrics]
+metrics = ["RMSE", "MAE"]
+```
+
+### Datasets
+
+Datasets specify which data we are going to use. This section contains the vectorizer, the user and/or content vectors, and any support files.
+
+#### Vectorizer
+
+One configuration file can specify only one vectorizer. The vectorizer is the name of the data from which each JSON file is derived.
+
+#### Vectors
+
+A vector is the transformed data that will be subjected to the recommender system algorithms and metrics.
+
+Understanding how a vector is created will provide an understanding of what a vector is. To create a vector, the steps are as follows:
+
+1. Read the configuration files to know what type of vectors we are creating.
+2. Read each JSON file to obtain the data. The output of this step is a dataframe.
+3. Once you have this dataframe, subject it to the transformation specified by the vector transformation. For example, to create a user vector from the JSON file "movielens_10m_ratings" with the vector transformation "ratings" as specified by config.ini above, the data from "movielens_10m_ratings" is transformed into an RDD of [(user_id, movie_id, rating)], because the vector transformation "ratings" converts MovieLens data into [(user_id, movie_id, rating)]. Different vector transformations implement different transformations of the data. The vector transformation "ratings_to_interact", for example, converts MovieLens data into [(user_id, movie_id, binary_rating)].
+
+To wrap it up, a vector refers to a dataframe that has been converted to an RDD after a transformation occurs. This transformation is specified by the vector transformation.
+
+There are two types of vectors currently implemented: User Vector and Content Vector. User Vector refers to the vector describing users in the data. Content Vector refers to the vector describing the content in the data.
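+
+As a rough illustration of the transformation in step 3, the sketch below shows what the "ratings" and "ratings_to_interact" transformations amount to in PySpark once the ratings JSON has been loaded into a dataframe. The field names (user_id, movie_id, rating) follow the MovieLens vectorizer in hermes/modules/vectorgenerator.py; sqlCtx and the file path are placeholders, not values Hermes hard-codes.
+
+```python
+# Load the ratings JSON into a dataframe, then map each row to a
+# (user_id, movie_id, rating) triple -- this RDD is the user vector.
+ratings_df = sqlCtx.read.json("/path/to/your/movielens/10m/ratings.json.gz")
+user_vector = ratings_df.map(lambda row: (row.user_id, row.movie_id, row.rating))
+
+# A different transformation yields a different vector from the same dataframe,
+# e.g. "ratings_to_interact" maps each rating to a -1/1 interaction instead.
+interact_vector = ratings_df.map(
+    lambda row: (row.user_id, row.movie_id, -1 if row.rating < 3 else 1))
+```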
+ +Each vector requires the following to be specified in the configuration file: +* **user_vector_data** / **content_vector_data**: Vector data takes in a list of JSON names that reference the JSON path as specified in the list of files config (ie. list_of_files.ini). user_vector_data will create a User Vector; content_vector_data will create a Content Vector. +* **user_vector_transformations** / **content_vector_transformations**: user_vector_transformations and content_vector_transformations will take in a list of transformations to apply to user_vector_data and content_vector_data respectively. Note that user_vector_data and user_vector_transformations (as well as content_vector_data and content_vector_transformations) have a one-on-one relationship, meaning vector transformation at index 0 will be applied to vector data at index 0, vector transformation at index 1 will be applied to vector data at index 1, and vector transformation at index n will be applied to vector data at index n. Currently, Hermes does not have the ability to apply multiple transformations onto one vector data unless the vector data is specified multiple times in user_vector_data / content_vector_data with its respective vector transformation. + +#### Optional Variables: Schemas & Support Files + +Each vector can specify optional variables that can assist in process speed or vector transformation: +* **user_vector_schemas** / **content_vector_schemas**: Specifying a schema for each data can speed up the reading process of the JSON file. Again, user_vector_schemas and content_vector_schemas have a one-to-one relationship with user_vector_data and content_vector_data respectively, meaning user_vector_schemas at index 0 applies to user_vector_data at index 0; content_vector_schemas at index 0 applies to content_vector_data at index 0. +* **support_files**: Additional variables listed in the [datasets] section will be treated as support files. During the creation of a Vector, these support files will be passed in as a dictionary with the key as a variable and the value as the value received. Currently, it cannot take a list of values as its value. For example: if glove_model = /data/glove/glove.txt is an additional line listed under the [datasets] section, it will be passed in as a dictionary with glove_model as key and /data/glove/glove.txt as its value. + +### Recommenders + +user_recommenders take in a list of recommender algorithms that will be applied to all user_vector_data. + +content_recommenders take in a list of recommender algorithms that will be applied to all content_vector_data. + +### Metrics + +metrics take in a list of metrics that will be applied to all data, including both user_vector_data and content_vector_data, after recommender algorithms have been applied to them. 
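+
+To make the mapping from configuration names to code concrete, the sketch below shows roughly how the configured recommender and metric names line up with the factories in hermes/modules/recommendergenerator.py and hermes/modules/metricgenerator.py. It assumes user_vector is a Vector object whose training and test splits have already been built; the surrounding state machine and error handling are omitted, so treat this as an outline rather than the exact hermes.py code.
+
+```python
+import recommendergenerator as rg
+import metricgenerator as mg
+
+# "ALS" from user_recommenders is looked up by name and applied to a user vector...
+recommender = rg.RecommenderFactory().create_obj_recommender("ALS", user_vector)
+user_vector.prediction_vector = recommender.make_prediction()
+
+# ...and each entry in metrics is looked up the same way and run on the result.
+for metric_name in ["RMSE", "MAE"]:
+    executor = mg.MetricExecutor(mg.MetricFactory().create_obj_metric(metric_name))
+    print executor.execute(user_vector)
+```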
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/data_supported.md b/docs/data_supported.md new file mode 100644 index 0000000..674cce4 --- /dev/null +++ b/docs/data_supported.md @@ -0,0 +1,164 @@ +# Datasets Supported + +* [Movie Lens](#movielens) + * [Configuration Files](#movielens-configuration-files) + * [Vector Transformation for User Vector](#movielens-vector-transformation-for-user-vector) + * [Vector Transformation for Content Vector](#movielens-vector-transformation-for-content-vector) +* [Wikipedia](#wiki) + * [Configuration Files](#wiki-configuration-files) + * [Vector Transformation for User Vector](#wiki-vector-transformation-for-user-vector) + * [Vector Transformation for Content Vector](#wiki-vector-transformation-for-content-vector) +* [Adding New Datasets](#adding-new-datasets) + + +Hermes currently supports the following dataset: + +Dataset | Location +------------- | ------------- +MovieLens | http://grouplens.org/datasets/movielens/ +Wikipedia | https://en.wikipedia.org/wiki/Wikipedia:Database_download#English-language_Wikipedia + +Additional datasets will be added in the future. + +If you have datasets not currently supported by Hermes, please follow the instructions in [Adding Additional Datasets](#adding-additional-datasets) section below. + +Before continuing, it might be beneficial if you understand the Hermes's framework by reading this [guide](https://github.com/Lab41/hermes/tree/master/docs/framework.md) first. + +## Movie Lens + +### Configuration Files +For JSON files derived from Movie Lens data, you need to specify the following: +* In configuration file, specify vectorizer = movielens +* In configuraiton file that lists all JSON files, specify section as [movielens] + +As long as the vectorizer check matches with the vectorizer given in the configuration files, Hermes will recognize it as a Movie Lens data. This check can be found in hermes/hermes/modules/vectorgenerator.py under a class function called isSameDataInstance(). What is checked in isSameDataInstance() has to match the vectorizer exactly. If it did not, Hermes will throw an error message. In this case, vectorizer has to match "movielens" exactly to recognize that this is a Movie Lens data. + +### Vector Transformation for User Vector + +You can specify the vector transformation on a user vector by specifying user_vector_transformations as one of the followings: + +* *ratings*: This vector transformation transforms the data into the format of [(user_id, movie_id, rating)]. +* *pos_ratings*: This vector transformation transforms the data into the format of [(user_id, movie_id, rating)] and filters out only ratings that are greater than 3. Meaning, this vector transformation will list all positive ratings where we assume a rating of 4 or 5 is a positive one. +* *ratings_to_interact*: This vector transformation transforms the data into the format of [(user_id, movie_id, binary_rating)] where binary_rating will return a value of -1 if it has a rating 2 or less and a value of 1 if it has a rating 3 or more. + +### Vector Transformation for Content Vector + +You can specify the vector transformation on a content vector by specifying content_vector_transformations as one of the followings: + +* *genre*: This vector transformation transforms the data into the format of [(movie_id, [genre_1, genre_2, ..., genre_n])]. 
 Meaning, this vector transformation will list the genres of the movie.
+
+## Wikipedia
+
+### Configuration Files
+For JSON files derived from Wikipedia data, you need to specify the following:
+* In the configuration file, specify vectorizer = wiki
+* In the configuration file that lists all JSON files, specify the section as [wiki]
+
+As long as the vectorizer check matches the vectorizer given in the configuration files, Hermes will recognize it as Wikipedia data. This check can be found in hermes/hermes/modules/vectorgenerator.py under a class function called isSameDataInstance(). What is checked in isSameDataInstance() has to match the vectorizer exactly. If it does not, Hermes will throw an error message. In this case, the vectorizer has to match "wiki" exactly for Hermes to recognize that this is Wikipedia data.
+
+### Vector Transformation for User Vector
+
+You can specify the vector transformation on a user vector by setting user_vector_transformations to one of the following:
+
+* *num_edits*: This vector transformation transforms the data into the format of [(user_id, article_id, num_edits)] where num_edits counts the number of times a user modified an article.
+* *any_interact*: This vector transformation transforms the data into the format of [(user_id, article_id, num_interact)] where num_interact indicates whether the user interacted with an article. Even if the user edits the article more than once, this vector transformation counts the interaction as one.
+* *num_edits_ceil*: This vector transformation transforms the data into the format of [(user_id, article_id, num_edits_with_ceiling)] where the number of times a user modified an article is counted and then capped at a ceiling of 5.
+
+### Vector Transformation for Content Vector
+
+You can specify the vector transformation on a content vector by setting content_vector_transformations to one of the following:
+
+* *glove*: Explanation will be provided once implemented. (TODO: in development)
+* *category_map*: Explanation will be provided once implemented. (TODO: in development)
+
+
+## Adding New Datasets
+
+Currently, adding a new dataset requires you to append the logic (see template below) to hermes/hermes/modules/vectorgenerator.py. To make this easier for the user, in the future each new dataset will be added in its own file. The template for supporting an additional dataset is shown below.
+
+Template:
+
+```bash
+class NewDataset(object):
+    @classmethod
+    def isSameDataInstance(cls, comparisonData):
+        return comparisonData.vectorizer == "new_dataset_vectorizer_name"
+
+class NewDatasetUserVector(UserVector, NewDataset):
+    def user_vector_transformation_1(self):
+        return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating))
+
+    def user_vector_transformation_2(self):
+        return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)).filter(lambda (u, m, r): r > 3)
+
+    def user_vector_transformation_n(self):
+        return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, -1 if row.rating < 3 else 1))
+
+class NewDatasetContentVector(ContentVector, NewDataset):
+    def content_vector_transformation_1(self):
+        def internal_helper_function(row):
+            return np.array((
+                int(row.genre_action),
+                int(row.genre_adventure),
+                int(row.genre_animation),
+            ))
+        return self.data.dataframe.map(lambda row: (row.movie_id, internal_helper_function(row)))
+
+```
+
+1. Instantiate a class for your dataset.
 In this case, it is specified as the class NewDataset.
+2. Instantiate a User Vector and a Content Vector class for your dataset that inherit from your dataset class and from UserVector or ContentVector respectively. In this case, the UserVector for NewDataset is called NewDatasetUserVector, and the ContentVector for NewDataset is called NewDatasetContentVector.
+3. Provide the vectorizer name for the check in isSameDataInstance(). In this case, the vectorizer is checked for equality with "new_dataset_vectorizer_name".
+4. Provide the vector transformation logic for each type of vector. For User Vector transformations, define the function in the class NewDatasetUserVector. In this case, these vector transformations are user_vector_transformation_1, user_vector_transformation_2, and user_vector_transformation_n. For Content Vector transformations, define the function in the class NewDatasetContentVector. In this case, the vector transformation is content_vector_transformation_1.
+5. Additional support files needed for the vector transformation are passed down from the configuration file as self.support_files. self.support_files is a dictionary with the key as a variable and the value as the value received in the configuration file. Please read the [configuration file guide](https://github.com/Lab41/hermes/tree/master/docs/configs.md#optional-variables) for more details.
+
+After you have defined the concrete implementation of the new dataset, you can use the dataset and apply multiple recommender system algorithms and metrics.
+
+In list_of_files.ini:
+```bash
+[new_dataset_vectorizer_name]
+new_dataset_10m_ratings = /path/to/your/new/dataset/10m/ratings.json.gz
+new_dataset_20m_ratings = /path/to/your/new/dataset/20m/ratings.json.gz
+new_dataset_10m_ratings_schema = /path/to/your/new/dataset/10m/ratings_schema.json.gz
+new_dataset_20m_ratings_schema = /path/to/your/new/dataset/20m/ratings_schema.json.gz
+
+new_dataset_10m_movies = /path/to/your/new/dataset/10m/movies.json.gz
+new_dataset_10m_movies_schema = /path/to/your/new/dataset/10m/movies_schema.json.gz
+```
+
+In new_dataset_config.ini:
+```bash
+[datasets]
+vectorizer = new_dataset_vectorizer_name
+
+# user vector
+user_vector_data = ["new_dataset_10m_ratings", "new_dataset_20m_ratings"]
+user_vector_schemas = ["new_dataset_10m_ratings_schema", "new_dataset_20m_ratings_schema"]
+user_vector_transformations = ["user_vector_transformation_1", "user_vector_transformation_2"]
+
+# content vector
+content_vector_data = ["new_dataset_10m_movies"]
+content_vector_schemas = ["new_dataset_10m_movies_schema"]
+content_vector_transformations = ["content_vector_transformation_1"]
+
+[recommenders]
+user_recommenders = ["ALS"]
+content_recommenders = ["CBWithKMeans"]
+
+[metrics]
+metrics = ["RMSE", "MAE"]
+```
+
+When you run hermes with the above configuration, the following will happen:
+* user_vector_transformation_1 will be applied to new_dataset_10m_ratings.
+* user_vector_transformation_2 will be applied to new_dataset_20m_ratings.
+* content_vector_transformation_1 will be applied to new_dataset_10m_movies.
+* ALS will be applied to UserVector of new_dataset_10m_ratings.
+* ALS will be applied to UserVector of new_dataset_20m_ratings.
+* CBWithKMeans will be applied to ContentVector of new_dataset_10m_movies.
+* RMSE will be applied to UserVector of new_dataset_10m_ratings after ALS has been subjected to it.
+* RMSE will be applied to UserVector of new_dataset_20m_ratings after ALS has been subjected to it.
+* RMSE will be applied to ContentVector of new_dataset_10m_ratings after CBWithKMeans has been subjected to it. +* MAE will be applied to UserVector of new_dataset_10m_ratings after ALS has been subjected to it. +* MAE will be applied to UserVector of new_dataset_20m_ratings after ALS has been subjected to it. +* MAE will be applied to ContentVector of new_dataset_10m_ratings after CBWithKMeans has been subjected to it. diff --git a/docs/framework.md b/docs/framework.md new file mode 100644 index 0000000..f3a7692 --- /dev/null +++ b/docs/framework.md @@ -0,0 +1,560 @@ +# Understanding Hermes's Framework + +* [Command Line Utilities](#command-line-utilities) +* [General Framework Concepts](#general-framework-concepts) +* [Main Components](#main-components) + * [hermesctl.py](#hermesctlpy) + * [Revising Hermes's Version Number](#revising-hermess-version-number) + * [Revising What to Log](#revising-what-to-log) + * [Understanding Spark Context](#understanding-spark-context) + * [Adding New Global Variables](#adding-new-global-variables) + * [Adding New States in State Machine](#adding-new-states-in-state-machine) + * [Adding New Variables in Cargo](#adding-new-variables-in-cargo) + * [Adding and Extracting New Configuration Variables](#adding-and-extracting-new-configuration-variables) + * [Adding New Configuration Variables](#adding-new-configuration-variables) + * [Extracting New Configuration Variables](#extracting-new-configuraiton-variables) + * [hermes.py](#hermespy) + * [Currently Defined States](#currently-defined-states) + * [start_state](#start_state) + * [json_to_rdd_state](#json_to_rdd_state) + * [split_data_state](#split_data_state) + * [make_prediction_state](#make_prediction_state) + * [calculate_metrics_state](#calculate_metrics_state) + * [error_state](#error_state) + * [Handling Multiple Next States](#handling-multiple-next-states) + * [Defining a New State](#defining-a-new-state) + * [hermesui.py](#hermesuipy) + * [Adding Additional UI](#adding-additional-ui) +* [Helper Components](#helper-components) + * [singleton.py](#singletonpy) + * [globals.py](#globalspy) + * [helper.py](#helperpy) + * [Adding New Global Helper Function](#adding-new-global-helper-functions) + * [cargo.py](#cargopy) + * [config.py](#configpy) + * [data.py](#datapy) + * [Adding New Vector Type](#adding-new-vector-type) + * [vectorgenerator.py](#vectorgeneratorpy) + * [Understanding What Vectors Are](#understanding-what-vectors-are) + * [Adding New Vector Type](#adding-new-vector-type-1) + * [Adding New Dataset](#adding-new-dataset) + * [Adding New Vector Transformation](#adding-new-vector-transformation) + * [recommendergenerator.py](#recommendergeneratorpy) + * [Adding New Recommender System Algorithms](#adding-new-recommender-system-algorithms) + * [Implementing a Different Use Case for a Specific Recommender System Algorithm](#implementing-a-different-use-case-for-a-specific-recommender-system-algorithm) + * [metricgenerator.py](#metricgeneratorpy) + * [Adding New Metric](#adding-new-metric) + * [statemachine.py](#statemachinepy) + * [timer.py](#timerpy) + +## Command Line Utilities + +Hermes uses Click as its command line utilities. To learn what parameters Hermes take for the command line, please read the guide [How to Run Hermes] +(https://github.com/lab41/hermes/tree/master/docs/run.md). 
+ +## General Framework Concepts + +The goal of Hermes is to give user the ability to run multiple recommender system algorithms and metrics on a particular dataset to determine which recommender system works best for this dataset. For this reason, we want to make the framework as modular as possible so that user can implement his/her own recommender system algorithms or performance metrics as needed if they were not yet implemented by default, asssuming that the target user is a data scientist. + +Hermes relies on a state machine as its framework. The beauty of the state machine is that state machine allows modularity. Each state represents a particular functionality, and states do not have to follow a singular path. This means that each state has the option to go to multiple different states for its next state depending on the context it was placed. + +Currently, Hermes has 5 states defined; they are start_state, json_to_rdd_state, split_data_state, make_prediction_state, and calculate_metrics_state. These states make up a state machine that follows this particular path (which can be subjected to change): + +```bash +start_state -> json_to_rdd_state -> split_data_state -> make_prediction_state -> calculate_metrics_state +``` + +Details of what each state does is explained in [hermes.py](#hermespy). + +Reading this entire article will give you the complete understanding of what the framework does. But if you wanted a TL;DR version, please check out the following: +* If you do not know a particular term used in Hermes, please check out the glossary: + * [Glossary](https://github.com/Lab41/hermes/tree/master/docs/glossary.md) +* If you are planning to change the flow of the state machine, please read: + * [Adding New States in State Machine](#adding-new-states-in-state-machine) + * [Defining a New State](#defining-a-new-state) + * [Handling Multiple Next States](#handling-multiple-next-states) +* If you are planning to use your own dataset not yet supported by Hermes, please read: + * [Understanding What Vectors Are](#understanding-what-vectors-are) + * [Datasets Supported](https://github.com/Lab41/hermes/tree/master/docs/data_supported.md), in particular Adding New Datasets section. +* If you are planning to use your own recommender system algorithms not yet supported by Hermes, please read: + * [Recommender System Algorithms Supported](https://github.com/Lab41/hermes/tree/master/docs/recommenders_supported.md), in particular Adding New Recommender System Algorithm section. +* If you are planning to use your own metrics not yet supported by Hermes, please read: + * [Metrics Supported](https://github.com/Lab41/hermes/tree/master/docs/metrics_supported.md), in particular Adding New Metric section. + +## Main Components + +Hermes has three main components: hermesctl.py, hermes.py, and hermesui.py. +* hermesctl.py is the entry point; it also handles instantiation. +* hermes.py defines every state in the state machine. +* hermesui.py defines the command line UI used in hermes.py. + +### hermesctl.py + +**Path: hermes/hermes/hermesctl.py** + +When you run the hermes binary, it will call on the main() function found in hermesctl.py. 
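+
+For orientation, the entry point conceptually looks like the sketch below. The argument names follow the [How to Run Hermes](https://github.com/Lab41/hermes/tree/master/docs/run.md) guide; the exact Click decorators and defaults here are illustrative, not copied from hermesctl.py.
+
+```python
+import click
+
+@click.command()
+@click.option("--verbose", is_flag=True, help="print debug log messages")
+@click.option("--hdfs_dir", default="datasets", help="HDFS directory for the input data")
+@click.argument("fs_default_ip_addr")
+@click.argument("list_of_files_config", type=click.Path(exists=True))
+@click.argument("configs", nargs=-1, type=click.Path(exists=True))
+def main(verbose, hdfs_dir, fs_default_ip_addr, list_of_files_config, configs):
+    # initialize the global variables, create the Cargo, parse the configuration
+    # files, add the states to the state machine, and run it
+    pass
+```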
+ +hermesctl.py is responsible for +* printing Hermes's version number +* initializing global varibles +* instantiating state machines +* creating cargo used in state machines +* parsing the configuration files +* running state machine + +#### Revising Hermes's Version Number + +You can check Hermes's version number by running +```bash +$ hermes --version +``` + +Team members revise the version number found in `hermes/hermes/__init__.py.` + +#### Revising What to Log + +We employ the logging library to log INFO, DEBUG, and ERROR messages. The logger is a global variable with the name "hermes". + +All INFO messages are outputted to the command line. + +ALL DEBUG messages are outputted to the command line and a log file called hermes.log. hermes.log is created wherever the hermes binary is run. Debug messages will only print when the --verbose option is passed. + +ALL ERROR messages are outputted to the command line and stderr. + +#### Understanding Spark Context + +Spark Context will not be instantiated if you run the framework in an iPython notebook (TODO: in development). + +Otherwise, it is wrapped in a singleton pattern to avoid multiple instantiation with the app name of "hermes". The singleton wrapper is defined in [singleton.py](#singletonpy) + +#### Adding New Global Variables + +Global variables are defined in [globals.py](#globalspy) and instantiated in hermesctl's main(). + +To add a new global variable, please define it in the Globals class in [globals.py](#globalspy). + +A list of what global variables are currently defined can be found in [globals.py](#globalspy). + +#### Adding New States in State Machine + +You can add a new state to Hermes in hermesctl's add_states() function, but you need to define what the state does (including where it needs to go next) in [hermes.py](#hermespy). If the new state is an end state, meaning there is no other state to go to next, you have to specify that it is an end state. + +To add a state, add the following line in hermesctl's add_states(): +```bash +state_machine.add_state(hermes.new_state) +``` + +To add an end state, add the following line in hermesctl's add_states(): +```bash +state_machine.add_state(hermes.new_state, isEndState=True) +``` + +#### Adding New Variables in Cargo + +Cargo is the object passed around in the state machine. Since we can never know until runtime where each state has derived from and where it will go next, we do not know what parameters to pass into each state. Cargo encapsulates all the parameters needed for each state in one object. It is defined in [cargo.py](#cargopy) and instantiated in hermesctl's main(). Future implementation will clean up Cargo so that one state does not know what another state's parameter needs are unless necessary (TODO: in development). + +To add a new variable in cargo for use in your newly defined state, please define it in the constructor of the Cargo class in [cargo.py](#cargopy). + +A list of what variables are currently defined in cargo can be found in [cargo.py](#cargopy). + +#### Adding and Extracting New Configuration Variables + +Configuration Files are currently extracted via the ConfigParser library. In the future, we might use ConfigObj as it supports subsections, which ConfigParser does not support (TODO: in development). 
+ +Listed below are recognized sections and their respective items: +* datasets + * vectorizer + * user_vector_data + * user_vector_transformations + * user_vector_schemas + * content_vector_data + * content_vector_transformations + * content_vector_schemas +* recommenders + * recommenders +* metrics + * metrics + +What Hermes will do when it encounters unrecognized section or section's item: +* If it does not recognize the section, it will skip the entire section. +* In datasets section, if vectorizer is not specified, it will quit the program. +* In datasets section, if User Vector (user_vector_data, user_vector_transformation) or Content Vector (content_vector_data, content_vector_transformation) or both are not specified, it will quit the program. In the future, it will also quit the program if it does not have User Vector and Content Vector specified when Content Vector is already specified (TODO: in development). +* Any other items in datasets that are not recognized are treated as a support_file item, meaning the variable is placed as a key and its value is placed as a value in a dictionary called support_files to be used later when generating the vector. +* In recommenders section, any items that are not recognized will be skipped. In the future, extra parameter variables needed for recommender system algorithms will be recognized (TODO: in development). +* In metrics section, any items that are not recognized will be skipped. In the future, extra parameter variables needed for calculating the metrics will be recognized (TODO: in development). + +Note that in datasets section, if user_vector_data and user_vector_transformations are defined in the configuration file, hermesctl.py will store these values inside a UserVector Data object. Similarly, if content_vector_data and content_vector_transformations are defined in the configuration file, hermesctl.py will store these values inside a ContentVector Data object. All Data objects are then placed inside Cargo's data list. + +##### Adding New Configuration Variables + +Add any [new_section] in the configuration file. Add any new section's items underneath the [new_section] in the configuration file as needed. + +##### Extracting New Configuration Variables + +To make your new section and its items recognizable, add them in [configs.py](#configspy)'s HEADINGS variable. + +Handle its extractions in hermesctl's extract_configs() function. For handling the pecularities of the section, follow the example of the datasets section. For handling the recognized and unrecognized section items, handle it in extract_configs()'s helper function handle_recognized_section_item() and handle_unrecognized_section_item() respectively. + +### hermes.py + +**Path: hermes/hermes/hermes.py** + +hermes.py defines all functions for all states in the state machine. + +#### Currently Defined States + +##### start_state + +start_state creates the HDFS directory specified by the user (if the user does not specify it, the default is datasets) and loads all JSON files into this HDFS directory. + +##### json_to_rdd_state + +json_to_rdd_state converts the JSON file into its respective RDD or Vectors. + +##### split_data_state + +split_data_state splits the data in Vector into train data, test data, and validation data depending on the input given by the user at runtime. 
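+
+Under the hood this state is a thin wrapper around Spark's randomSplit, as in Vector.split_data() in hermes/modules/vectorgenerator.py. A simplified sketch is shown below; the weights and seed are example values standing in for whatever the user supplies at runtime.
+
+```python
+# Split a vector's RDD into training, test, and validation pieces.
+weights = [0.8, 0.1, 0.1]   # example proportions supplied by the user
+seed = 41                   # example seed
+
+training_vector, test_vector, validation_vector = vector.vector.randomSplit(weights, seed)
+vector.training_vector = training_vector
+vector.test_vector = test_vector
+vector.validation_vector = validation_vector
+```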
+
+##### make_prediction_state
+
+make_prediction_state takes the training data from each Vector, develops a model based on that training data and the recommender specified in the configuration file, and makes a prediction based on this model.
+
+##### calculate_metrics_state
+
+calculate_metrics_state computes the metrics specified in the configuration file. This is an end state.
+
+##### error_state
+
+error_state is where states go when they encounter an error. This is an end state.
+
+#### Handling Multiple Next States
+
+If you want a state to go to multiple next states, define the switch in the state of interest and make sure you return newState and cargo with the correct next state (the name of the state function) and the necessary parameters initialized or added to cargo.
+
+#### Defining a New State
+
+Defining a new state is the same as defining a function in hermes.py. Make sure you add the new state into the state machine by following the instructions in [Adding New States in State Machine](#adding-new-states-in-state-machine).
+
+### hermesui.py
+
+**Path: hermes/hermes/hermesui.py**
+
+hermesui.py defines all the command line user interfaces used in hermes.py.
+
+#### Adding Additional UI
+
+Most configuration can be addressed using the configuration file. However, if you need to ask the user for a configuration at runtime, define the UI function in hermesui.py and call it as needed from the required state.
+
+## Helper Components
+
+### singleton.py
+
+**Path: hermes/modules/singleton.py**
+
+SCSingleton is a singleton pattern object that wraps the Spark Context to avoid multiple instantiations of the Spark Context.
+
+### globals.py
+
+**Path: hermes/modules/globals.py**
+
+Listed below are the currently defined global variables:
+* verbose: a boolean variable that prints out debug log messages
+* logger: logging object that logs messages
+* scsingleton: singleton object that defines the Spark Context
+
+To add a new global variable, please see [Adding New Global Variables](#adding-new-global-variables).
+
+### helper.py
+
+**Path: hermes/modules/helper.py**
+
+helper.py defines all global helper functions used in multiple places throughout the framework.
+
+#### Adding New Global Helper Function
+
+To add a new global helper function, create the function in helper.py and import helper.py in the files that need it.
+
+### cargo.py
+
+**Path: hermes/modules/cargo.py**
+
+Cargo is the object passed around in the state machine. Since we can never know until runtime where each state has derived from and where it will go next, we do not know what parameters to pass into each state. Cargo encapsulates all the parameters needed for each state in one object.
+
+Listed below are the currently defined cargo variables:
+* hdfs_dir: Name of the HDFS directory used to store input data. One of the options passed in when running the hermes binary. Default = datasets.
+* fs_default_ip_addr: IP address of fs.default.name used in HDFS. One of the arguments passed in when running the hermes binary. Default = localhost:9000.
+* datas: List of Data objects initialized when extracting the configuration file.
+* vectors: List of Vector objects initialized during one of the states in the state machine, json_to_rdd_state.
+* support_files: Unrecognized items in the [datasets] section of the configuration file that are presumed to be support files for the creation of a Vector.
+* recommenders: List of recommender system algorithms initialized when extracting the configuration file.
+* metrics: List of metrics initialized when extracting the configuration file.
+* error_msg: It starts out as an empty string that will be initialized as an error message to the error state. + +To add a new variable in cargo, please see [Adding New Variables in Cargo](#adding-new-variables-in-cargo). + +### config.py + +**Path: hermes/modules/config.py** + +config.py has a list of recognized section and section's items used in the parsing of the configuration file. It also has functions defined to assist in the parsing of the configuration file. + +### data.py + +**Path: hermes/modules/data.py** + +Class Data is defined in data.py to store the configurations specified in the configuration file. We have not decided whether or not this is the best way to store configurations from the configuration file. (TODO: in development) + +Currently, it has a subclass called UserVectorData nad ContentVectorData to differentiate the two different Vector Types that Hermes supports. + +#### Adding New Vector Type + +Hermes has two vector types: UserVector and ContentVector. If you wanted to add a new vector type, you will need to follow the instructions in [Adding New Vector Type](#adding-new-vector-type-1) under the vectorgenerator.py as well as add its respective Data object for storing its configuration. + +### vectorgenerator.py + +**Path: hermes/modules/vectorgenerator.py** + +#### Understanding What Vectors Are + +In Hermes, when we referenced a vector, it refers to a dataframe that has been converted to a RDD after a transformation occurs. This transformation is specified by the vector transformation. For example, if you have Movie Lens data and you wanted to build a user vector from this data, if you specified the vector transformation to be "ratings" in the configuration file, the data from the JSON file is transformed into a dataframe and then a RDD of [(user_id, item_id, rating)]. In other words, the output of this transformation is a vector of [(user_id, item_id, rating)]. + +There are two types of vectors: User Vector and Content Vector. User Vector refers to the vector describing users in the data. Content Vector refers to the vector describing content in the data. Collaborative Filtering Recommender System typically uses only User Vector, and Content Based Recommender System typically uses both User Vector and Content Vector, but this does not have to be the case. + +Every vector type inherits from the Vector class, meaning all User Vector and Content Vector will have the following variables: +* data: a Data object containing the configuration for this particular vector from the configuration file +* support_files: list of unrecognized variables in [datasets] section of the configuration file that we assume is a support file for the creation of a Vector +* vector_transformation: transformation needed to convert data from a JSON file to a specified vector +* training_vector: part of the vector that is split for training +* test_vector: part of the vector that is split for test +* validation_vector: part of the vector that is split for validation +* prediction_vector: part of the vector that is predicted based on test_vector and the model that is created from training_vector + +Since each data requires its own specific vector transformation, every data has its own class as well as its own UserVector and ContentVector. The data's UserVector and ContentVector inherit from both the data's own class as well as UserVector or ContentVector respectively. The data's UserVector and ContentVector have functions defined in their class to execute vector transformation. 
 The name of these functions has to match the name of the vector transformation passed in via the configuration file in order for the vector transformation to occur.
+
+Vectorizer is a variable used in the configuration file to refer to the data that each JSON file comes from. The data's own class has a check function called isSameDataInstance() to verify that the vectorizer passed in via the configuration file describes the same data as the data's own class.
+
+To automatically create a vector (i.e. which vector type and from which data), VectorFactory is there to the rescue! It can either return a Vector object or the RDD / vector itself by calling VectorFactory().create_obj_vector(...) or VectorFactory().create_vector(...) respectively.
+
+#### Adding New Vector Type
+
+UserVector and ContentVector are the two vector types supported in Hermes. If you want to add a new vector type, create a class for your new vector type that inherits from the Vector class. Add additional variables and functions to the class as needed.
+
+```bash
+class MyNewVectorType(Vector):
+    pass
+```
+
+#### Adding New Dataset
+
+Please read [Datasets Supported's section on Adding New Datasets](https://github.com/Lab41/hermes/tree/master/docs/data_supported.md#adding-new-datasets).
+
+#### Adding New Vector Transformation
+
+To add a new vector transformation, go to the data class itself and decide which vector type it belongs to. Under the class of that vector type, define the new vector transformation as a class function.
+
+For example, if you wanted to create a vector transformation for MovieLens data's UserVector, do the following:
+```bash
+class MovieLens(object):
+    @classmethod
+    def isSameDataInstance(cls, comparisonData):
+        return comparisonData.vectorizer == "movielens"
+
+class MovieLensUserVector(UserVector, MovieLens):
+    def ratings(self):
+        return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating))
+
+    def new_vector_transformation(self):
+        # your defined vector transformation
+        ...
+        return vector_after_the_transformation
+
+```
+Instead of naming the new function new_vector_transformation, name it according to the vector transformation name you want to use in the configuration file.
+
+### recommendergenerator.py
+
+**Path: hermes/modules/recommendergenerator.py**
+
+recommendergenerator.py is built to allow clearer execution of code using the bridge pattern. For example, let's try to create a model using ALS. To do so, we can execute the following:
+
+```bash
+import recommendergenerator as rg
+
+recommender = rg.ALS(vector)
+```
+
+We can then make a prediction because the Recommender object already knows about the recommender system algorithm to use as well as the training and test data from the parameter vector that we passed in.
+
+```bash
+prediction_vector = recommender.make_prediction()
+```
+
+If we have a specific use case that is different from the normal ALS use case, we can define that abnormal use case for ALS and call it as follows:
+
+```bash
+abnormal_usecase = Abnormal()
+recommender = rg.ALS(vector, abnormal_usecase)
+prediction_vector = recommender.make_prediction()
+```
+
+We do not have to call the make_prediction() function differently. We just call make_prediction() because it will call make_prediction() specifically for the abnormal use case.
+
+Also, when you change the recommender system algorithm, say to CBWithKMeans, all you need to do is create that Recommender object; when you are ready to make your prediction, call make_prediction(), and behind the scenes it will call CBWithKMeans's make_prediction().
+
+```python
+recommender = rg.CBWithKMeans(vector)
+prediction_vector = recommender.make_prediction()
+```
+
+#### Adding New Recommender System Algorithms
+
+To add a new recommender system algorithm, create a class that inherits from the Recommender class and define a make_prediction() function that calls the recommender system algorithm's own make-prediction function.
+
+```python
+class NewRecommenderSystemAlgorithm(Recommender):
+    def make_prediction(self):
+        return self.implementation.make_prediction_with_new_recommender_system_algorithm(self.vector)
+```
+
+self.implementation is the use case that you want to use. The default use case is the Normal class. If you have another use case, for example an abnormal use case, you want to define a class, called Abnormal for example, that inherits from ImplementationInterface.
+
+So let's do that; let's define an abnormal use case.
+```python
+class Abnormal(ImplementationInterface):
+    pass
+```
+
+Let's say we want to define the make_prediction() function for both the normal and the abnormal use cases. The first thing we need to do is define the make_prediction() function for our new recommender system algorithm in the ImplementationInterface, so that if there is another use case that does not implement our new recommender system algorithm's make_prediction() function, it will fail by raising a NotImplementedError.
+
+```python
+class ImplementationInterface(object):
+    def make_prediction_with_als(self, vector):
+        raise NotImplementedError
+
+    def make_prediction_with_cbwithkmeans(self, vector):
+        raise NotImplementedError
+
+    def make_prediction_with_new_recommender_system_algorithm(self, vector):
+        raise NotImplementedError
+```
+
+After you have defined it in the ImplementationInterface class, you also want to define it in the Normal class.
+
+```python
+class Normal(ImplementationInterface):
+    def make_prediction_with_als(self, vector):
+        ...
+        return prediction_vector
+
+    def make_prediction_with_cbwithkmeans(self, vector):
+        ...
+        return prediction_vector
+
+    def make_prediction_with_new_recommender_system_algorithm(self, vector):
+        # implement your make_prediction() for the normal use case
+        return prediction_vector
+```
+
+Now implement it in your Abnormal class too.
+```python
+class Abnormal(ImplementationInterface):
+    def make_prediction_with_new_recommender_system_algorithm(self, vector):
+        # implement your make_prediction() for the abnormal use case
+        return prediction_vector
+```
+
+You are done. :)
+
+#### Implementing a Different Use Case for a Specific Recommender System Algorithm
+
+Let's do this with the ALS recommender system algorithm. We want to create an abnormal use case. To do so, we need to define the Abnormal class, which inherits from ImplementationInterface.
+
+```python
+class Abnormal(ImplementationInterface):
+    pass
+```
+
+Since ALS's make_prediction() function is already defined for the normal use case, we just need to define it for the abnormal use case as well, with the abnormal use case's implementation.
+
+```python
+class Abnormal(ImplementationInterface):
+    def make_prediction_with_als(self, vector):
+        # implement your make_prediction() for the abnormal use case
+        return prediction_vector
+```
+
+You are done. :)
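+
+As a usage sketch (assuming, as above, that the recommender is constructed with the vector and an optional implementation), selecting the abnormal use case could then look like this:
+
+```python
+# Hypothetical usage -- the exact constructor arguments depend on recommendergenerator.py.
+import recommendergenerator as rg
+
+recommender = rg.ALS(vector, Abnormal())           # swap in the abnormal use case
+prediction_vector = recommender.make_prediction()  # dispatches to Abnormal.make_prediction_with_als()
+```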
+
+### metricgenerator.py
+
+**Path: hermes/modules/metricgenerator.py**
+
+metricgenerator.py is also built to allow clearer execution of code, this time using the strategy pattern. You have a MetricExecutor that executes different types of metrics and can change the metric at runtime.
+
+For example, to execute RMSE and then PRFS on different vectors:
+
+```python
+executor = MetricExecutor(RMSE())
+print executor.execute(vector1)
+print executor.execute(vector2)
+executor = MetricExecutor(PRFS())
+print executor.execute(vector1)
+print executor.execute(vector2)
+```
+
+MetricFactory is a class that automatically instantiates the right metric depending on what is specified in the configuration file.
+
+#### Adding New Metric
+
+To add a new metric, create a class that inherits from the Metric class and define a calculate_metric function in the class.
+
+```python
+class MyCoolNewMetric(Metric):
+    def calculate_metric(self, vector):
+        # calculate your cool new metric here
+        # or
+        # define your cool new metric in hermes/metrics/performance_metrics.py
+        return metrics.performance_metrics.calculate_my_cool_new_metric(vector.test_vector, vector.prediction_vector)
+```
+
+### statemachine.py
+
+**Path: hermes/modules/statemachine.py**
+
+statemachine.py defines the concrete implementation of the state machine.
+
+Here is how you can use a state machine:
+```python
+# state1 -> state2 -> state3a
+#                  -> state3b
+# where state1, state2, state3a, and state3b are defined functions.
+
+from modules.statemachine import StateMachine
+
+sm = StateMachine()
+sm.add_state(state1)
+sm.add_state(state2)
+sm.add_state(state3a, isEndState=True)
+sm.add_state(state3b, isEndState=True)
+sm.set_start(state1)
+sm.run()
+
+# or, if you have a cargo defined, instead of sm.run() you can do the following:
+# sm.run(Cargo())
+```
+
+### timer.py
+
+**Path: hermes/modules/timer.py**
+
+timer.py defines a Timer class that you can use anywhere in the code to time how long a particular function runs.
+
+For example, if you wanted to time how long somefunction() runs, do the following:
+```python
+from modules.timer import Timer
+
+with Timer() as t:
+    somefunction()
+print("somefunction() takes %s seconds or %s milliseconds" % (t.secs, t.msecs))
+```
diff --git a/docs/glossary.md b/docs/glossary.md
new file mode 100644
index 0000000..e9ad1fe
--- /dev/null
+++ b/docs/glossary.md
@@ -0,0 +1,82 @@
+# Glossary
+
+This is a glossary of common terms used in Hermes and their meanings.
+
+## A
+
+## B
+
+## C
+**Content Vector**: Content Vector refers to the vector describing the content in the data.
+
+## D
+**Dataframe**: A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. (Excerpt taken from Spark's SQL Programming Guide). In Hermes, the dataframe variable defined in the Data class refers to the dataframe created after reading in the JSON file.
+
+## E
+
+## F
+
+## G
+
+## H
+
+## I
+
+## J
+
+## K
+
+## L
+
+## M
+
+**Metrics**: See "Performance Metrics".
+
+## N
+
+## O
+
+## P
+
+**Performance Metrics**: Performance Metrics allow a user to evaluate a recommender and how much value it adds for the end user.
+
+## Q
+
+## R
+**RDD**: Resilient Distributed Dataset or RDD is the basic abstraction in Spark that represents an immutable, partitioned collection of elements that can be operated on in parallel. (Excerpt taken from Spark's man page about RDD).
+ +**Recommender**: See "Recommender System Algorithms". + +**Recommender System Algorithms**: Hermes use Recommender System Algorithms to build a model based on the train data and make a prediction based on the test data. + +## S + +## T +**Tradespace**: It is the space spanned by completely enumerated design variables, which means given a set of design variables, the tradespace is the space of possible design options. (Excerpt taken from Adam M. Ross & Daniel E. Hasting's "The Tradespace Exploration Program") + +**Test Data**: Data is usually split into train data, test data, and validation data. After you have used the train data to build a model and validation data to select the best performing model out of all the models, you use test data to estimate the accuracy of the selected approach. In other words, you want to estimate how well your model has been trained. + +**Train Data**: Data is usually split into train data, test data, and validation data. Train data is used by a recommender to build a model by pairing the input with the expected output. + + +## U +**User Vector**: User Vector refers to the vector describing users in the data. + +## V +**Validation Data**: Data is usually split into train data, test data, and validation data. Validation data is used to select which is the best performing model out of all the models you trained with the train data. Sometimes validation data is optional. + +**Vector**: In Hermes, when we referenced a vector, it refers to a dataframe that has been converted to a RDD after a transformation occurs. This transformation is specified by the vector transformation. For example, in the case of a user vector, if the vector transformation is "ratings" for Movie Lens data, the data from the JSON file is transformed into a RDD of [(user_id, item_id, rating)]. The output of this transformation is a vector of [(user_id, item_id, rating)]. + +**Vector Transformation**: In Hermes, vector transformation refers to the transformation needed to convert data from a JSON file to a specified vector. Please see **Vector** for more details. + +**Vector Type**: Hermes separates vectors into two distinct types: User Vector and Content Vector. User Vector refers to the vector describing users in the data. Content Vector refers to the vector describing content in the data. Users can implement other vector types as needed if User Vector and Content Vector does not describe the vector they are building. + +**Vectorizer**: Vectorizer is a variable used in configuration file to refer to the data where each JSON file is coming from. + +## W + +## X + +## Y + +## Z \ No newline at end of file diff --git a/docs/installation.md b/docs/installation.md index 2c559f3..f8c8bc7 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -1,68 +1,70 @@ -## Hermes Installation Guide +# Hermes Installation Guide -### Dependencies: +## Dependencies: * Spark 1.5.1 * Scala 2.11.7 * Pyspark 0.8.2.1 * Hadoop 2.7.1 * virtualenv -### How to Install Dependencies on Mac OS X: -#### Installing Spark, Scala, and PySpark +## How to Install Dependencies on Mac OS X: +### Installing Spark, Scala, and PySpark 1. Install Java 1. Download 2. Double click on .dmg file to install. 3. In a terminal, type java -version. You should see the following: -` +```bash java version "1.8.0_65" Java(TM) SE Runtime Environment (build 1.8.0_65-b17) Java HotSpot(TM) 64-Bit Server VM (build 25.65-b01, mixed mode) -` +``` 2. Set JAVA_HOME +```bash export JAVA_HOME=$(/usr/libexec/java_home) +``` 3. 
Install Homebrew -` -ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" -` +```bash +$ ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" +``` 4. Install Scala -` -brew install scala -` +``` +$ brew install scala +``` 5. Download Spark from https://spark.apache.org/downloads.html. 6. Set SCALA_HOME and SPARK_HOME and export it to path in your .bash_profile. -` +```bash export SPARK_HOME=/path/to/your/spark export PATH=$PATH:$SPARK_HOME/bin export SCALA_HOME=/path/to/your/scala export PATH=$PATH:$SCALA_HOME/bin -` +``` 7. Export PySpark classes to the Python path after you have installed Python. -` +```bash export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH -` +``` 8. Build and install Apache Spark -` -brew install sbt -cd $SPARK_HOME -sbt/sbt clean assembly -` +```bash +$ brew install sbt +$ cd $SPARK_HOME +$ sbt/sbt clean assembly +``` -#### Installing Hadoop +### Installing Hadoop Please follow this [guide](http://zhongyaonan.com/hadoop-tutorial/setting-up-hadoop-2-6-on-mac-osx-yosemite.html). -#### Installing virtualenv +### Installing virtualenv Please read this [guide](http://docs.python-guide.org/en/latest/dev/virtualenvs/) for more details. -` -pip install virtualenv -` +```bash +$ pip install virtualenv +``` -### How to Install Hermes: +## How to Install Hermes: (Optional) After you have installed the dependencies, if you have different projects that require different Python environment, you can use a Virtual Environment. As listed in the Virtual Environment's [site](http://docs.python-guide.org/en/latest/dev/virtualenvs/), "a Virtual Environment is a tool to keep the dependencies required by different projects in separate places, by creating virtual Python environments for them." diff --git a/docs/metrics_supported.md b/docs/metrics_supported.md new file mode 100644 index 0000000..e7d1cc9 --- /dev/null +++ b/docs/metrics_supported.md @@ -0,0 +1,51 @@ +# Metrics Supported + +* [RMSE](#rmse) +* [MAE](#mae) +* [PRFS](#prfs) +* [Adding New Metric](#adding-new-metric) + +This excerpt is taken out from [Understanding Hermes's Framework](https://github.com/Lab41/hermes/tree/master/docs/framework.md#metricgeneratorpy). It will be helpful if you read this guide first. + +metricgenerator.py is also built to allow clearer execution of code using the strategy pattern. You have MetricExceutor that executes different types of metrics and change the metrics during runtime. + +For example: we want to execute RMSE and then execute PRFS with different vectors. + +```bash +exeggutor = MetricExecutor(RMSE()) +print exeggutor.execute(vector1) +print exeggutor.execute(vector2) +exeggutor = MetricExecutor(PRFS) +print exeggutor.execute(vector1) +print exeggutor.execute(vector2) +``` + +MetricFactory() is a class that will automatically instantiate which metric dependent on what is specified in the configuration file. + +## RMSE + +Explanation of what RMSE does will be provided in the future. (TODO) + +## MAE + +Explanation of what MAE does will be provided in the future. (TODO) + +## PRFS + +Explanation of what PRFS does will be provided in the future. (TODO) + + +#### Adding New Metric + +This excerpt is taken out from [Understanding Hermes's Framework](https://github.com/Lab41/hermes/tree/master/docs/framework.md#adding-new-metric). + +To add a new metric, create a class that inherits from the Metric class and define a calculate_metric function in the class. 
+ +```bash +class MyCoolNewMetric(Metric): + def calculate_metric(self, vector): + # calculate your cool new metric here + # or + # define your cool new metric in hermes/metrics/performance_metrics.py + return metrics.performance_metrics.calculate_my_cool_new_metric(vector.test_vector, vector.prediction_vector) +``` \ No newline at end of file diff --git a/docs/recommenders_supported.md b/docs/recommenders_supported.md new file mode 100644 index 0000000..6eba87b --- /dev/null +++ b/docs/recommenders_supported.md @@ -0,0 +1,117 @@ +# Recommender Algorithms Supported + +* [ALS](#als) + * [Use Cases Supported](#use-cases-supported) +* [Content Base with K-Means](#content-base-with-k-means) + * [Use Cases Supported](#use-cases-supported-1) +* [Adding New Recommender System Algorithms](#adding-new-recommender-system-algorithms) + +This excerpt is taken out from [Understanding Hermes's Framework](https://github.com/Lab41/hermes/tree/master/docs/framework.md#recommendergeneratorpy). It will be helpful if you read this guide first. + +recommendergenerator.py is built to allow clearer execution of code using the bridge pattern. For example, let's try to create a model using ALS. To do so, we can execute the following: + +```bash +import recommendergenerator as rg + +recommender = rg.ALS(vector) +``` + +We can then make a prediction because the Recommender object already knows about the recommender system algorithm to use as well as the training and test data from the parameter vector that we passed in. + +```bash +prediction_vector = recommender.make_prediction() +``` + +If we have a specific use case that is different than the normal ALS use case, we can define that abnormal use case for ALS and call it as follows: + +```bash +abnormal_usecase = AbnormalUseCase() +recommender = ALS(abnormal_usecase) +prediction_vector = recommender.make_prediction() +``` + +We do not have to call the make_prediction() function differently. We just call make_prediction() because it will call make_prediction() specifically for the abnormal use case. + +Also, when you change the recommender system algorithm, say for example CBWithKMeans, all you need to do is create this Recommender object and when you are ready to make your prediction, call make_prediction() because it will make sure that behind the scene, it will call CBWithKMeans's make_prediction(). + +```bash +recommender = CBWithKMeans() +prediction_vector = recommender.make_prediction() +``` + +## ALS + +Explanation of what ALS does will be provided in the future. (TODO) + +### Use Cases Supported + +* Normal Use Case + +## Content Base with K-Means + +Explanation of what Content Base with K-Means will be provided in the future. (TODO) + +### Use Cases Supported + +* Normal Use Case + +#### Adding New Recommender System Algorithms + +This excerpt is taken out from [Understanding Hermes's Framework](https://github.com/Lab41/hermes/tree/master/docs/framework.md#adding-new-recommender-system-algorithms). + +To add a new recommender system algorithm, instantiate a class that inherits from Recommender class and defines the make_prediction() function that calls on the recommender system algorithm's own make prediction function. + +```bash +class NewRecommenderSystemAlgorithm(Recommender): + def make_prediction(self) + return self.implementation.make_prediction_with_new_recommender_system_algorithm(self.vector) +``` + +self.implementation is the use case that you want to use. The default use case is the Normal class. 
If you have another use case, for example: an abnormal use case, you want to instantiate a class called Abnormal, for example, that inherits from ImplementationInterface. + +So let's do that, let's define an abnormal use case. +```bash +class Abnormal(ImplementationInterface): + pass +``` + +Let's say we want to define the make_prediction() function for both normal and abnormal use case. Therefore, the first thing we need to do is define the make_prediction() function for our new recommender system algorithm in the ImplementationInterface so that in case there is another use case that does not implement our new recommender system algorithm's make_prediction() function, it will fail by raising a NotImplemented error. + +```bash +class ImplementationInterface(object): + def make_prediciton_with_als(self): + raise NotImplemented + + def make_prediction_with_cbwithkmeans(self): + raise NotImplemented + + def make_prediction_with_new_recommender_system_algorithm(self): + raise NotImplemented +``` + +After you defined in the ImplementationInterface class, you also want to define it in Normal class. + +```bash +class Normal(ImplementationInterface): + def make_prediction_with_als(self): + ... + return prediciton_vector + + def make_prediction_with_cbwithkmeans(self): + ... + return prediction_vector + + def make_prediction_with_new_recommender_system_algorithm(self): + # implement your make_prediction() for the normal use case + return prediciton_vector +``` + +Now begin implementing it in your Abnormal class too. +```bash +class Abnormal(ImplementationInterface): + def make_prediction_with_new_recommender_system_algorithm(self): + # implement your make_prediction() for the abnormal use case + return prediction_vector +``` + +You are done. :) \ No newline at end of file diff --git a/docs/run.md b/docs/run.md new file mode 100644 index 0000000..8d2adf0 --- /dev/null +++ b/docs/run.md @@ -0,0 +1,49 @@ +# How to Run Hermes + +Hermes requires at least three arguments in order to run properly. +* fs_default_ip_addr: IP address of fs.default.name used in HDFS, ie. localhost:9000. +* list_of_files_config: A configuration file that lists all the json paths referenced by configs. +* configs: Users can provide an unlimited amount of configuration files that list what datasets to use and which recommender algorithms and metrics to apply to each dataset. + +For more details about list_of_files_config and configs, please read the [Configuration Files Guide](https://github.com/Lab41/hermes/tree/master/docs/configs.md). + +With one configuration file: +```bash +$ hermes localhost:9000 ./hermes/configs/list_of_files.ini ./hermes/configs/config1.ini +``` + +With more than one configuration files: +```bash +$ hermes localhost:9000 ./hermes/configs/list_of_files.ini ./hermes/configs/config1.ini ./hermes/configs/config2.ini +``` + +## Options + +The hermes binary can take in multiple options: +* --version +* --verbose +* --hdfs_dir + +### --version +--version displays the current hermes binary version number. The binary version number is located in hermes/hermes/__init__.py under the variable __version__. + +```bash +$ hermes --version +``` + +### --verbose +--verbose will print out all debug messages to help you debug the code. + +```bash +$ hermes --verbose localhost:9000 ./hermes/configs/list_of_files.ini ./hermes/configs/config1.ini +``` + +### --hdfs_dir +--hdfs_dir requires you to pass in the name of the HDFS directory to store the input data given in the configuration files. 
The default name is set as "datasets". + +```bash +$ hermes --hdfs_dir datasets localhost:9000 ./hermes/configs/list_of_files.ini ./hermes/configs/config1.ini +``` + + + diff --git a/hermes/__init__.py b/hermes/__init__.py index 61a202c..e43be03 100644 --- a/hermes/__init__.py +++ b/hermes/__init__.py @@ -1,2 +1 @@ -import hermes __version__ = '1.0' \ No newline at end of file diff --git a/hermes/configs/my_config.ini b/hermes/configs/movielens_config.ini similarity index 81% rename from hermes/configs/my_config.ini rename to hermes/configs/movielens_config.ini index 5d5bc57..265eaf2 100644 --- a/hermes/configs/my_config.ini +++ b/hermes/configs/movielens_config.ini @@ -2,15 +2,16 @@ vectorizer = movielens user_vector_data = ["movielens_10m_ratings"] user_vector_schemas = ["movielens_10m_ratings_schema"] -user_vector_types = ["ratings"] +user_vector_transformations = ["ratings"] #content_vector_data = ["movielens_10m_movies"] #content_vector_schemas = ["movielens_10m_movies_schema"] -#content_vector_types = ["genre"] +#content_vector_transformations = ["genre"] [recommenders] recommenders = ["ALS"] #user_recommenders = ["ALS"] #content_recommenders = [""] + [metrics] metrics = ["RMSE", "MAE"] \ No newline at end of file diff --git a/hermes/configs/config_1.ini b/hermes/configs/wiki_config.ini similarity index 79% rename from hermes/configs/config_1.ini rename to hermes/configs/wiki_config.ini index 64fd6ef..723a61a 100644 --- a/hermes/configs/config_1.ini +++ b/hermes/configs/wiki_config.ini @@ -5,9 +5,9 @@ output_directory = /output/wikipedia_cbkmeans [datasets] vectorizer = wiki user_vector_data = ["edit_history"] -user_vector_types = ["num_edits_ceil"] +user_vector_transformations = ["num_edits_ceil"] content_vector_data = ["full_text"] -content_vector_types = ["glove_model"] +content_vector_transformations = ["glove_model"] glove_model = /data/glove/glove.txt [recommenders] diff --git a/src/data_prep/movieLens_vectorize.py b/hermes/data_prep/movieLens_vectorize.py similarity index 100% rename from src/data_prep/movieLens_vectorize.py rename to hermes/data_prep/movieLens_vectorize.py diff --git a/src/data_prep/osm_vectoize.py b/hermes/data_prep/osm_vectoize.py similarity index 100% rename from src/data_prep/osm_vectoize.py rename to hermes/data_prep/osm_vectoize.py diff --git a/src/data_prep/wiki_vectorize.py b/hermes/data_prep/wiki_vectorize.py similarity index 100% rename from src/data_prep/wiki_vectorize.py rename to hermes/data_prep/wiki_vectorize.py diff --git a/hermes/hermes.py b/hermes/hermes.py index 1b2e840..7c0c83a 100644 --- a/hermes/hermes.py +++ b/hermes/hermes.py @@ -1,151 +1,150 @@ -"""Helper functions to hermesctl.py""" +"""Defined states in Hermes's state machine""" import json import logging import os import hermesui -import metrics.performance_metrics -import modules.datum -import modules.metricgenerator as metricgenerator -import modules.recommendergenerator as recommendergenerator -import modules.timer -import modules.vectorgenerator as vectorgenerator +import modules.metricgenerator as mg +import modules.recommendergenerator as rg +import modules.vectorgenerator as vg + +from modules.globals import Globals +from modules.timer import Timer # TODO: empty certain items in cargo after no longer needed? # TODO: when to use error_state? do try-catch for all states? -# get logger -logger = logging.getLogger("hermes") - def start_state(cargo): - """Start of the state machine. Create HDFS directory and upload the input data. 
- Returns: json_to_rdd_state as next state - """ + """Start of the state machine. Create HDFS directory and upload the input data. + Returns: json_to_rdd_state as next state + """ - if cargo.verbose: logger.debug("In start_state:") + if Globals.verbose: Globals.logger.debug("In start_state:") - if cargo.verbose: logger.debug("Creating the hdfs directory " + cargo.hdfs_dir) - os.system("hdfs dfs -mkdir " + cargo.hdfs_dir) + if Globals.verbose: Globals.logger.debug("Creating the hdfs directory " + cargo.hdfs_dir) + os.system("hdfs dfs -mkdir " + cargo.hdfs_dir) - def load_json_files(datas): - for i in range(0, len(datas)): - json_path = datas[i].datapath - if cargo.verbose: logger.debug("Loading JSON file " + json_path + " into hdfs directory " + cargo.hdfs_dir) - os.system("hdfs dfs -put " + json_path + " " + cargo.hdfs_dir + "/" + os.path.basename(json_path)) + def load_json_files(datas): + for i in range(0, len(datas)): + json_path = datas[i].datapath + if Globals.verbose: Globals.logger.debug("Loading JSON file " + json_path + " into hdfs directory " + cargo.hdfs_dir) + os.system("hdfs dfs -put " + json_path + " " + cargo.hdfs_dir + "/" + os.path.basename(json_path)) - load_json_files(cargo.datas) + load_json_files(cargo.datas) - newState = json_to_rdd_state - if cargo.verbose: logger.debug("start_state -> json_to_rdd_state") + newState = json_to_rdd_state + if Globals.verbose: Globals.logger.debug("start_state -> json_to_rdd_state") - return newState, cargo + return newState, cargo +# TODO: make json_to_rdd_state, split_data_state, and make_prediction_state into one state? def json_to_rdd_state(cargo): - """Parse JSON to RDD. - Returns: split_data_state as next state - """ + """Parse JSON to RDD. + Returns: split_data_state as next state + """ - if cargo.verbose: logger.debug("In json_to_rdd_state:") + if Globals.verbose: Globals.logger.debug("In json_to_rdd_state:") - # create RDD for each JSON file and store it in Cargo's vectors list - for i in range(0, len(cargo.datas)): - data = cargo.datas[i] - if cargo.verbose: logger.debug("Working with json file %s" % data.datapath) + # create RDD for each JSON file and store it in Cargo's vectors list + for i in range(0, len(cargo.datas)): + data = cargo.datas[i] + if Globals.verbose: Globals.logger.debug("Working with json file %s" % data.datapath) - if cargo.verbose: logger.debug("Creating dataframe based on the content of the json file") - datapath_in_hdfs = "hdfs://" + cargo.fs_default_ip_addr + "/" + cargo.hdfs_dir + "/" + os.path.basename(data.datapath) - data.set_dataframe(cargo.scsingleton, datapath_in_hdfs) + if Globals.verbose: Globals.logger.debug("Creating dataframe based on the content of the json file") + datapath_in_hdfs = "hdfs://" + cargo.fs_default_ip_addr + "/" + cargo.hdfs_dir + "/" + os.path.basename(data.datapath) + data.set_dataframe(Globals.scsingleton, datapath_in_hdfs) - if cargo.verbose: logger.debug("Creating RDD based on the computed dataframe and configuration provided by the user") - cargo.vectors.append( vectorgenerator.VectorFactory().create_obj_vector(cargo.scsingleton.sqlCtx, data, cargo.support_files) ) + if Globals.verbose: Globals.logger.debug("Creating RDD based on the computed dataframe and configuration provided by the user") + # TODO: remove sqlCtx since it's global? + cargo.vectors.append( vg.VectorFactory().create_obj_vector(Globals.scsingleton.sqlCtx, data, cargo.support_files) ) - # TODO: clean cargo? 
- # cargo.datas = [] - # cargo.hdfs_dir = None - # cargo.fs_default_ip_addr = None + # TODO: clean cargo? + # cargo.datas = [] + # cargo.hdfs_dir = None + # cargo.fs_default_ip_addr = None - newState = split_data_state - if cargo.verbose: logger.debug("json_to_rdd_state -> split_data_state") + newState = split_data_state + if Globals.verbose: Globals.logger.debug("json_to_rdd_state -> split_data_state") - return newState, cargo + return newState, cargo def split_data_state(cargo): - """Split data to train, test, and (optional) validate. - Returns: next state dependent whether or not it is using collaborative filtering or content based - """ + """Split data to train, test, and (optional) validate. + Returns: make_prediction_state as next state + """ - if cargo.verbose: logger.debug("In split_data_state:") + if Globals.verbose: Globals.logger.debug("In split_data_state:") - for i in range(0, len(cargo.vectors)): - vector = cargo.vectors[i] - weights, seed = hermesui._ask_user_for_split_percentage(vector.data.datapath) - vector.split_data(weights, seed) + for i in range(0, len(cargo.vectors)): + vector = cargo.vectors[i] + weights, seed = hermesui._ask_user_for_split_percentage(vector.data.datapath) + vector.split_data(weights, seed) - newState = make_prediction_state - if cargo.verbose: logger.debug("split_data_state -> make_prediction_state") + newState = make_prediction_state + if Globals.verbose: Globals.logger.debug("split_data_state -> make_prediction_state") - return newState, cargo + return newState, cargo def make_prediction_state(cargo): - """Develop model based on the train data and make prediction based on this model. - Returns: calculate_metrics_state as next state - """ + """Develop model based on the train data and make prediction based on this model. + Returns: calculate_metrics_state as next state + """ - if cargo.verbose: logger.debug("In make_prediction_state:") + if Globals.verbose: Globals.logger.debug("In make_prediction_state:") - for i in range(0, len(cargo.vectors)): - for r in cargo.recommenders: - # TODO: implement other implementations, ie. WithTfidf(), etc. - # default is WithoutTfidf() - recommender = recommendergenerator.RecommenderFactory().create_obj_recommender(r, cargo.vectors[i]) - # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithTfidf()) - # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithoutTfidf()) - # etc. - with modules.timer.Timer() as t: - cargo.vectors[i].prediction_vector = recommender.make_prediction() - if cargo.verbose: logger.debug("Making prediction takes %s seconds" % t.secs) + for i in range(0, len(cargo.vectors)): + for r in cargo.recommenders: + # TODO: implement other implementations, ie. WithTfidf(), etc. + # default is WithoutTfidf() + recommender = rg.RecommenderFactory().create_obj_recommender(r, cargo.vectors[i]) + # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithTfidf()) + # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithoutTfidf()) + # etc. 
+ with Timer() as t: + cargo.vectors[i].prediction_vector = recommender.make_prediction() + if Globals.verbose: Globals.logger.debug("Making prediction takes %s seconds" % t.secs) - newState = calculate_metrics_state - if cargo.verbose: logger.debug("make_prediction_state -> calculate_metrics_state") + newState = calculate_metrics_state + if Globals.verbose: Globals.logger.debug("make_prediction_state -> calculate_metrics_state") - return newState, cargo + return newState, cargo def calculate_metrics_state(cargo): - """Test the metrics specified by the user. This is an end state. - Returns: None because this is the last state. - """ - - if cargo.verbose: logger.debug("In calculate_metrics_state:") - - # create a metric executor - executor = metricgenerator.MetricExecutor(metricgenerator.Metric()) - - # TODO: figure out why logger prints INFO twice - for i in range(0, len(cargo.vectors)): - logger.info("-" * 80) - logger.info("Data: %s" % cargo.vectors[i].data.datapath) - for m in cargo.metrics: - # check if metric exists - metric = metricgenerator.MetricFactory().create_obj_metric(m) - # set metric in executor - executor.change_metric(metric) - # execute the metric - with modules.timer.Timer() as t: - logger.info("Metric: %s = %f" % (m, executor.execute(cargo.vectors[i]))) - if cargo.verbose: logger.debug("Calculating metric takes %s seconds" % t.secs) - logger.info("-" * 80) - if cargo.verbose: logger.debug("calculate_metrics_state -> end_state") - - return + """Test the metrics specified by the user. This is an end state. + Returns: None because this is the last state + """ + + if Globals.verbose: Globals.logger.debug("In calculate_metrics_state:") + + # create a metric executor + executor = mg.MetricExecutor(mg.Metric()) + + # TODO: figure out why logger prints INFO twice + for i in range(0, len(cargo.vectors)): + Globals.logger.info("-" * 80) + Globals.logger.info("Data: %s" % cargo.vectors[i].data.datapath) + for m in cargo.metrics: + # check if metric exists + metric = mg.MetricFactory().create_obj_metric(m) + # set metric in executor + executor.change_metric(metric) + # execute the metric + with Timer() as t: + Globals.logger.info("Metric: %s = %f" % (m, executor.execute(cargo.vectors[i]))) + if Globals.verbose: Globals.logger.debug("Calculating metric takes %s seconds" % t.secs) + Globals.logger.info("-" * 80) + if Globals.verbose: Globals.logger.debug("calculate_metrics_state -> end_state") + + return def error_state(cargo): - """Error state. Print out the error messages. This is an end state. - Returns: None because this is the last state. - """ - if cargo.verbose: logger.debug("In error_state:") - logger.error("ERROR: " + cargo.error_msg) - if cargo.verbose: logger.debug("error_state -> end_state") - return + """Error state. Print out the error messages. This is an end state. 
+ Returns: None because this is the last state + """ + if Globals.verbose: Globals.logger.debug("In error_state:") + Globals.logger.error("ERROR: " + cargo.error_msg) + if Globals.verbose: Globals.logger.debug("error_state -> end_state") + return diff --git a/hermes/hermesctl.py b/hermes/hermesctl.py index cb41a66..521e0dd 100644 --- a/hermes/hermesctl.py +++ b/hermes/hermesctl.py @@ -1,4 +1,4 @@ -"""Script to run hermes via command line.""" +"""Hermes's entry point""" import click import ConfigParser @@ -9,245 +9,309 @@ from pyspark import SparkConf import hermes -import modules.config as Config -from modules.data import UserVectorData, ContentVectorData +import modules.config as config + from modules.cargo import Cargo +from modules.data import UserVectorData, ContentVectorData +from modules.globals import Globals from modules.singleton import SCSingleton from modules.statemachine import StateMachine -def add_states(stateMachine): - """ json_to_rdd -> split_data - (Collaborative Filtering) -> develop_model -> calculate_metrics - - (Content Based) -> ??? - """ - stateMachine.add_state(hermes.start_state) - stateMachine.add_state(hermes.json_to_rdd_state) - stateMachine.add_state(hermes.split_data_state) - stateMachine.add_state(hermes.make_prediction_state) - stateMachine.add_state(hermes.calculate_metrics_state, isEndState=1) - stateMachine.add_state(hermes.error_state, isEndState=1) - stateMachine.set_start(hermes.start_state) - return +def add_states(state_machine): + """ Add states to the given state machine. + + The current implemented state machine follows this path: + json_to_rdd -> split_data -> make_prediction -> calculate_metrics + + Args: + state_machine: state machine + """ + state_machine.add_state(hermes.start_state) + state_machine.add_state(hermes.json_to_rdd_state) + state_machine.add_state(hermes.split_data_state) + state_machine.add_state(hermes.make_prediction_state) + state_machine.add_state(hermes.calculate_metrics_state, isEndState=True) + state_machine.add_state(hermes.error_state, isEndState=True) + state_machine.set_start(hermes.start_state) + return def create_logger(name): - logger = logging.getLogger(name) - logger.setLevel(logging.DEBUG) - # create hermes.log file that prints out debug messages - fh = logging.FileHandler("hermes.log") - fh.setLevel(logging.DEBUG) - # create console handler for stderr that prints out error messages - che = logging.StreamHandler() - che.setLevel(logging.ERROR) - # create console handler for stdout for info, debug, and error level - choi = logging.StreamHandler(sys.stdout) - choi.setLevel(logging.INFO) - chod = logging.StreamHandler(sys.stdout) - chod.setLevel(logging.DEBUG) - choe = logging.StreamHandler(sys.stdout) - choe.setLevel(logging.ERROR) - # create formatter and add it to the handlers - formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") - fh.setFormatter(formatter) - che.setFormatter(formatter) - choi.setFormatter(formatter) - chod.setFormatter(formatter) - choe.setFormatter(formatter) - # add handlers to logger - logger.addHandler(fh) - logger.addHandler(che) - logger.addHandler(choi) - logger.addHandler(chod) - logger.addHandler(choe) - return logger + """ Create logger with the given name if it's not already created. 
+ + Args: + name: name of logger + Returns: + logger + """ + logger = logging.getLogger(name) + + # check if logger is already created; if not, create it + if not logger.handlers: + logger.setLevel(logging.DEBUG) + # create hermes.log file that prints out debug messages + fh = logging.FileHandler("hermes.log") + fh.setLevel(logging.DEBUG) + # create console handler for stderr that prints out error messages + che = logging.StreamHandler() + che.setLevel(logging.ERROR) + # create console handler for stdout for info, debug, and error level + choi = logging.StreamHandler(sys.stdout) + choi.setLevel(logging.INFO) + chod = logging.StreamHandler(sys.stdout) + chod.setLevel(logging.DEBUG) + choe = logging.StreamHandler(sys.stdout) + choe.setLevel(logging.ERROR) + # create formatter and add it to the handlers + formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + fh.setFormatter(formatter) + che.setFormatter(formatter) + choi.setFormatter(formatter) + chod.setFormatter(formatter) + choe.setFormatter(formatter) + # add handlers to logger + logger.addHandler(fh) + logger.addHandler(che) + logger.addHandler(choi) + logger.addHandler(chod) + logger.addHandler(choe) + + return logger def create_sparkcontext(): - conf = SparkConf().setAppName("hermes") - return SCSingleton(conf) - -def extract_configs(configs_path, list_of_files_config_path, cargo, logger): - # TODO: is there a better way to implement this function? - - # extract list_of_files_config - lofcp = ConfigParser.ConfigParser() - lofcp.read(list_of_files_config_path) - - # helper functions for extracting configs - def handle_recognized_section_item(section, item_key, item_value): - if section == "datasets": - datasets_items[item_key] = item_value - return - if section == "recommenders": - if item_key == "recommenders": - cargo.recommenders.extend( json.loads(item_value) ) - return - if section == "metrics": - if item_key == "metrics": - cargo.metrics.extend( json.loads(item_value) ) - - def handle_unrecognized_section_item(section, item_key, item_value): - if section == "datasets": - # add support file - cargo.support_files[item_key] = item_value - return - if section == "recommenders": - logger.error("ERROR: skip unrecognized item " + item_key + " under section [" + section + "] in config" + config_path) - return - if section == "metrics": - logger.error("ERROR: skip unrecognized item " + item_key + " under section [" + section + "] in config" + config_path) - return - - def handle_dataset_section(dataset_items, config_path): - # make sure vectorizer is initialized in order to verify the section in list_of_files_config - # TODO: which is better? iterating through sections then items or iterating through just items of list_of_files_config? 
- - if not ("vectorizer" in datasets_items.keys()): - logger.error("ERROR: config " + config_path + " must have vectorizer specified.") - sys.exit() - - vectorizer = datasets_items["vectorizer"] - lofmap = Config.map_section(lofcp, vectorizer) - - # create UserVectorData or ContentVectorData or both - hasUserVector = False - # check it has the required items to build a UserVectorData - if set(Config.REQ_UV_HEADINGS) < set(datasets_items.keys()): - hasUserVector = True - create_datas(lofmap, vectorizer, datasets_items, config_path, isUserVector=True) - - hasContentVector = False - # check it has the required items to build a ContentVectorData - if set(Config.REQ_CV_HEADINGS) < set(datasets_items.keys()): - hasContentVector = True - create_datas(lofmap, vectorizer, datasets_items, config_path, isUserVector=False) - - if not hasUserVector and not hasContentVector: - logger.error("ERROR: config " + config_path + " does not have declaration for a user vector or a content vector") - sys.exit() - - def create_datas(lofmap, vectorizer, datasets_items, config_path, isUserVector): - """ - user_vector_data = movielens_10m_ratings, bleh_ratings - user_vector_schemas = movielens_10m_ratings_schema, bleh_schema - user_vector_types = ratings, bleh - - """ - - if isUserVector: - datapaths_heading = "user_vector_data" - vector_types_heading = "user_vector_types" - schemapaths_heading = "user_vector_schemas" - else: - datapaths_heading = "content_vector_data" - vector_types_heading = "content_vector_types" - schemapaths_heading = "content_vector_schemas" - - datapaths = json.loads(datasets_items[datapaths_heading]) - vector_types = json.loads(datasets_items[vector_types_heading]) - hasSchemas = False - if "user_vector_schemas" in datasets_items.keys(): - schemapaths = json.loads(datasets_items[schemapaths_heading]) - hasSchemas = True - - # check that a vector type is specified for each data - # TODO: multiple vector types for each data in the future? 
- if len(datapaths) != len(vector_types): - logger.error("ERROR: must specify a vector type for each data in config " + config_path) - sys.exit() - - for i in range(0, len(datapaths)): - # set datapath - try: - datapath = lofmap[datapaths[i]] - except KeyError: - logger.error("ERROR: cannot find data " + datapath + " in the list_of_files_config for config " + config_path) - sys.exit() - # set vector_type - vector_type = vector_types[i] - # set schemapath - try: - if hasSchemas: schemapath = lofmap[schemapaths[i]] - except IndexError, KeyError: - schemapath = None - - if isUserVector: - uservectordata = UserVectorData(datapath, vector_type, schemapath, vectorizer) - cargo.datas.append(uservectordata) - else: - contentvectordata = ContentVectorData(datapath, vector_type, schemapath, vectorizer) - cargo.datas.append(contentvectordata) - - # extract configs - for config_path in configs_path: - cp = ConfigParser.ConfigParser() - cp.read(config_path) - datasets_items = {} - # extract sections - for section in cp.sections(): - if section in Config.HEADINGS.keys(): - # extract section's items - for (item_key, item_value) in cp.items(section): - if item_key in Config.HEADINGS.get(section): - handle_recognized_section_item(section, item_key, item_value) - else: - handle_unrecognized_section_item(section, item_key, item_value) - # end extract item - else: - logger.error("ERROR: skip unrecognized section heading [" + section + "] in config " + config_path) - # handle "datasets" section - if section == "datasets": - handle_dataset_section(datasets_items, config_path) - # end extract sections - # end extract configs + """ Create a single Spark Context with the app name hermes. + + Returns: + SCSingleton: wrapper object that prevents multiple instantiation of the spark context + + """ + conf = SparkConf().setAppName("hermes") + return SCSingleton(conf) + + +# TODO: is there a better way to implement this function? +def extract_configs(configs_path, list_of_files_config_path, cargo): + """ Extract configuration files and store the configurations in cargo. + + Args: + configs_path: list of paths to configuration files + list_of_files_config_path: path to list of files configuration file + cargo: object passed in state machine + + """ + + # extract list_of_files_config + lofcp = ConfigParser.ConfigParser() + lofcp.read(list_of_files_config_path) + + def handle_recognized_section_item(section, item_key, item_value): + """ Helper function that extracts recognized section items. """ + if section == "datasets": + datasets_items[item_key] = item_value + # [datasets] items will be placed into cargo in handle_dataset_section() + return + if section == "recommenders": + if item_key == "recommenders": + # add list of recommenders into cargo + cargo.recommenders.extend( json.loads(item_value) ) + return + if section == "metrics": + if item_key == "metrics": + # add list of metrics into cargo + cargo.metrics.extend( json.loads(item_value) ) + return + + def handle_unrecognized_section_item(section, item_key, item_value): + """ Helper function that extracts unrecognized section items. 
""" + if section == "datasets": + # any unrecognized [datasets] items will be placed in cargo's support_files dictionary + cargo.support_files[item_key] = item_value + return + if section == "recommenders": + Globals.logger.error("ERROR: skip unrecognized item " + item_key + " under section [" + section + "] in config" + config_path) + return + if section == "metrics": + Globals.logger.error("ERROR: skip unrecognized item " + item_key + " under section [" + section + "] in config" + config_path) + return + + def handle_dataset_section(dataset_items, config_path): + """ Helper function that handles [datasets] section. """ + # TODO: which is better? iterating through sections then items or iterating through just items of list_of_files_config? + + # make sure vectorizer is initialized in order to verify the section in list_of_files_config + if not ("vectorizer" in datasets_items.keys()): + Globals.logger.error("ERROR: config " + config_path + " must have vectorizer specified.") + sys.exit() + + vectorizer = datasets_items["vectorizer"] + lofmap = config.map_section(lofcp, vectorizer) + + # create UserVectorData or ContentVectorData or both + hasUserVector = False + # check it has the required items to build a UserVectorData + if set(config.REQ_UV_HEADINGS) < set(datasets_items.keys()): + hasUserVector = True + create_datas(lofmap, vectorizer, datasets_items, config_path, isUserVector=True) + + hasContentVector = False + # check it has the required items to build a ContentVectorData + if set(config.REQ_CV_HEADINGS) < set(datasets_items.keys()): + hasContentVector = True + create_datas(lofmap, vectorizer, datasets_items, config_path, isUserVector=False) + + if not hasUserVector and not hasContentVector: + Globals.logger.error("ERROR: config " + config_path + " does not have declaration for a user vector or a content vector") + sys.exit() + + def create_datas(lofmap, vectorizer, datasets_items, config_path, isUserVector): + """ Helper function that creates a UserVectorData or ContentVectorData depending if it isUserVector or not. + + Storing configuration for UserVector or ContentVector in an object (like UserVectorData and ContentVectorData) + is easier than storing its individual parts. UserVectorData and ContentVectorData will be added into cargo in + cargo's data list. + """ + + if isUserVector: + datapaths_heading = "user_vector_data" + vector_transformations_heading = "user_vector_transformations" + schemapaths_heading = "user_vector_schemas" + else: + datapaths_heading = "content_vector_data" + vector_transformations_heading = "content_vector_transformations" + schemapaths_heading = "content_vector_schemas" + + datapaths = json.loads(datasets_items[datapaths_heading]) + vector_transformations = json.loads(datasets_items[vector_transformations_heading]) + hasSchemas = False + if schemapaths_heading in datasets_items.keys(): + schemapaths = json.loads(datasets_items[schemapaths_heading]) + hasSchemas = True + + # check that a vector transformation is specified for each data + # TODO: multiple vector trasnformation for each data in the future? 
+ if len(datapaths) != len(vector_transformations): + Globals.logger.error("ERROR: must specify a vector type for each data in config " + config_path) + sys.exit() + + for i in range(0, len(datapaths)): + # set datapath + try: + datapath = lofmap[datapaths[i]] + except KeyError: + Globals.logger.error("ERROR: cannot find data " + datapath + " in the list_of_files_config for config " + config_path) + sys.exit() + # set vector_transformation + vector_transformation = vector_transformations[i] + # set schemapath + try: + if hasSchemas: schemapath = lofmap[schemapaths[i]] + except IndexError, KeyError: + schemapath = None + + if isUserVector: + uservectordata = UserVectorData(datapath, vector_transformation, schemapath, vectorizer) + cargo.datas.append(uservectordata) + else: + contentvectordata = ContentVectorData(datapath, vector_transformation, schemapath, vectorizer) + cargo.datas.append(contentvectordata) + + # extract configs + for config_path in configs_path: + cp = ConfigParser.ConfigParser() + cp.read(config_path) + datasets_items = {} + # extract sections + for section in cp.sections(): + if section in config.HEADINGS.keys(): + # extract section's items + for (item_key, item_value) in cp.items(section): + if item_key in config.HEADINGS.get(section): + handle_recognized_section_item(section, item_key, item_value) + else: + handle_unrecognized_section_item(section, item_key, item_value) + # end extract items + else: + Globals.logger.error("ERROR: skip unrecognized section heading [" + section + "] in config " + config_path) + # handle [datasets] section + if section == "datasets": + handle_dataset_section(datasets_items, config_path) + # end extract sections + # end extract configs def print_version(ctx, param, value): - """Print the current version of hermes and exit.""" - if not value: - return - import pkg_resources - version = None - try: - version = pkg_resources.get_distribution("hermes").version - finally: - del pkg_resources - click.echo(version) - ctx.exit() - -# TODO: add option to print what recommenders + """Print the current version of hermes and exit.""" + if not value: + return + import pkg_resources + version = None + try: + version = pkg_resources.get_distribution("hermes").version + finally: + del pkg_resources + click.echo(version) + ctx.exit() + +def print_data(ctx, param, value): + """Print a list of data currently supported and exit.""" + if not value: + return + click.echo("This option is not yet implemented.") + ctx.exit() + +def print_recommenders(ctx, param, value): + """Print a list of recommender system algorithms currently supported and exit.""" + if not value: + return + click.echo("This option is not yet implemented.") + ctx.exit() + +def print_metrics(ctx, param, value): + """Print a list of metrics currently supported and exit.""" + if not value: + return + click.echo("This option is not yet implemented.") + ctx.exit() + +# TODO: implement print_data, print_recommenders, print_metrics? 
@click.command() -@click.option("--verbose", is_flag=True, \ - help="Print debug messages") @click.option("--version", callback=print_version, is_flag=True, expose_value=False, is_eager=True, \ - help="Display hermes's version number") + help="Display hermes's version number.") +@click.option("--data", callback=print_data, is_flag=True, expose_value=False, is_eager=True, \ + help="Print a list of data currently supported.") +@click.option("--algos", callback=print_recommenders, is_flag=True, expose_value=False, is_eager=True, \ + help="Print a list of recommender system algorithms currently supported.") +@click.option("--metrics", callback=print_metrics, is_flag=True, expose_value=False, is_eager=True, \ + help="Print a list of metrics currently supported.") +@click.option("--verbose", is_flag=True, \ + help="Print debug messages") @click.option("--hdfs_dir", default="datasets", \ - help="Name of HDFS directory to store input data.") + help="Name of HDFS directory to store input data. Default = datasets.") # IP address of fs.default.name used in HDFS @click.argument("fs_default_ip_addr", default="localhost:9000") @click.argument("list_of_files_config", type=click.Path(exists=True), nargs=1) @click.argument("configs", type=click.Path(exists=True), nargs=-1) def main(verbose, hdfs_dir, fs_default_ip_addr, list_of_files_config, configs): - """Hermes allows you to run multiple recommender system metrics on your chosen dataset.""" - - # create logger - logger = create_logger("hermes") - # create state machine - stateMachine = StateMachine() - add_states(stateMachine) + # initialize global variables + Globals.verbose = verbose + Globals.logger = create_logger("hermes") + Globals.scsingleton = create_sparkcontext() - # create cargo - cargo = Cargo() + # create state machine + state_machine = StateMachine() + add_states(state_machine) - # add items to cargo - cargo.scsingleton = create_sparkcontext() - cargo.verbose = verbose - cargo.hdfs_dir = hdfs_dir - cargo.fs_default_ip_addr = fs_default_ip_addr + # create cargo + cargo = Cargo() - # extract configs and add them to cargo - extract_configs(configs, list_of_files_config, cargo, logger) + # add items to cargo + cargo.hdfs_dir = hdfs_dir + cargo.fs_default_ip_addr = fs_default_ip_addr + # extract configs and add them to cargo + extract_configs(configs, list_of_files_config, cargo) - # run state machine - stateMachine.run(cargo) - + # run state machine + stateMachine.run(cargo) + diff --git a/hermes/hermesui.py b/hermes/hermesui.py index 89c372b..3301330 100644 --- a/hermes/hermesui.py +++ b/hermes/hermesui.py @@ -1,135 +1,137 @@ +"""Hermes's user interface via the command line""" + def _ask_user_for_rdd_format(schema_path, schema_names): - """Ask user for the desired RDD format. - Args: - schema_path: the path to the schema file - schema_names: - Returns: List of schema_name's id. - """ - print "How do you want your data to be parsed?" - print "For example: Given the following options" - print "(0) movie_id" - print "(1) rating" - print "(2) timestamp" - print "(3) user_id" - print "if you wanted the data to be parsed in the format of [(user_id, movie_id, rating)]," - print "please type in: 3 0 1\n" - - def _check_schema_ids(schema_ids, num_schema_ids): - - # check if each schema_name_id is in the range of num_schema_ids - for schema_name_id in schema_name_ids: - if schema_name_id not in range(0, num_schema_ids): - print "Option provided is not in range." 
- return False - - # check that there are no duplicates - if len(schema_name_ids) != len(set(schema_name_ids)): - print "There are duplicates. Please provide no duplicates." - return False - - return True - - - print "For the following given schema %s" % (schema_path) - print "how do you want your data to be parsed? " - for i in range(0, len(schema_names)): - print "(%s) %s" % (i, schema_names[i]) - - while True: - user_input = raw_input("Enter the numbers separated by blank space: ") - try: - schema_name_ids = [int(schema_name_id.strip()) for schema_name_id in user_input.split(" ")] - if _check_schema_ids(schema_name_ids, len(schema_names)): - break - except ValueError: - print "Please provide a valid number." - - return schema_name_ids + """Ask user for the desired RDD format. + Args: + schema_path: the path to the schema file + schema_names: + Returns: List of schema_name's id. + """ + print "How do you want your data to be parsed?" + print "For example: Given the following options" + print "(0) movie_id" + print "(1) rating" + print "(2) timestamp" + print "(3) user_id" + print "if you wanted the data to be parsed in the format of [(user_id, movie_id, rating)]," + print "please type in: 3 0 1\n" + + def _check_schema_ids(schema_ids, num_schema_ids): + + # check if each schema_name_id is in the range of num_schema_ids + for schema_name_id in schema_name_ids: + if schema_name_id not in range(0, num_schema_ids): + print "Option provided is not in range." + return False + + # check that there are no duplicates + if len(schema_name_ids) != len(set(schema_name_ids)): + print "There are duplicates. Please provide no duplicates." + return False + + return True + + + print "For the following given schema %s" % (schema_path) + print "how do you want your data to be parsed? " + for i in range(0, len(schema_names)): + print "(%s) %s" % (i, schema_names[i]) + + while True: + user_input = raw_input("Enter the numbers separated by blank space: ") + try: + schema_name_ids = [int(schema_name_id.strip()) for schema_name_id in user_input.split(" ")] + if _check_schema_ids(schema_name_ids, len(schema_names)): + break + except ValueError: + print "Please provide a valid number." + + return schema_name_ids def _ask_user_for_split_percentage(datum_json_path): - """Ask user what percentage to split the data into training, test, and validation. - Args: - datum_json_path: the path to the data JSON file - Returns: Tuple of percentage of training, test, and validation respectively in float notation. - (trainingPercentage, testPercentage, validationPercentage), seed - """ - print "How do you want to split your data?" - print "For example: If you wanted to split the data into " - print "60% training, 40% test, 0% validation, seed = 11, please type in:" - print "Percentage for training: 60" - print "Percentage for test: 40" - print "Percentage for validation: 0" - print "Seed: 11\n" - - - def _check_percentage(percentage): - """Check if the percentage is valid. - """ - if percentage in range(0, 100): - return True - else: - return False - - def _check_sum_percentage(a, b, c): - """Check if the sum of the given percentages is equal to 100. - """ - sum_percentage = a + b + c - if sum_percentage == 100: - return True - else: - return False - - print "For the following given data %s" % (datum_json_path) - print "how do you want to split your data?" - while True: - while True: - try: - trainingPercentage = int(raw_input("Percentage for training: ").strip()) - except ValueError: - print "Please provide a valid number." 
- else: - if _check_percentage(trainingPercentage): - break - else: - print "Please provide a number from 0 - 100." - while True: - try: - testPercentage = int(raw_input("Percentage for test: ").strip()) - except ValueError: - print "Please provide a valid number." - else: - if _check_percentage(testPercentage): - break - else: - print "Please provide a number from 0 - 100." - while True: - try: - validationPercentage = int(raw_input("Percentage for validation: ").strip()) - except ValueError: - print "Please provide a valid number." - else: - if _check_percentage(validationPercentage): - break - else: - print "Please provide a number from 0 - 100." - if _check_sum_percentage(trainingPercentage, testPercentage, validationPercentage): - break - else: - print "Sum of percentages does not equal to 100. Please re-input the percentages." - - while True: - try: - seed = int(raw_input("Seed: ").strip()) - break - except ValueError: - print "Please provide a valid number." - - # convert it to a percentage from 0 - 1 - trainingPercentage = trainingPercentage/100. - testPercentage = testPercentage/100. - validationPercentage = validationPercentage/100. - - return [trainingPercentage, testPercentage, validationPercentage], seed + """Ask user what percentage to split the data into training, test, and validation. + Args: + datum_json_path: the path to the data JSON file + Returns: Tuple of percentage of training, test, and validation respectively in float notation. + (trainingPercentage, testPercentage, validationPercentage), seed + """ + print "How do you want to split your data?" + print "For example: If you wanted to split the data into " + print "60% training, 40% test, 0% validation, seed = 11, please type in:" + print "Percentage for training: 60" + print "Percentage for test: 40" + print "Percentage for validation: 0" + print "Seed: 11\n" + + + def _check_percentage(percentage): + """Check if the percentage is valid. + """ + if percentage in range(0, 100): + return True + else: + return False + + def _check_sum_percentage(a, b, c): + """Check if the sum of the given percentages is equal to 100. + """ + sum_percentage = a + b + c + if sum_percentage == 100: + return True + else: + return False + + print "For the following given data %s" % (datum_json_path) + print "how do you want to split your data?" + while True: + while True: + try: + trainingPercentage = int(raw_input("Percentage for training: ").strip()) + except ValueError: + print "Please provide a valid number." + else: + if _check_percentage(trainingPercentage): + break + else: + print "Please provide a number from 0 - 100." + while True: + try: + testPercentage = int(raw_input("Percentage for test: ").strip()) + except ValueError: + print "Please provide a valid number." + else: + if _check_percentage(testPercentage): + break + else: + print "Please provide a number from 0 - 100." + while True: + try: + validationPercentage = int(raw_input("Percentage for validation: ").strip()) + except ValueError: + print "Please provide a valid number." + else: + if _check_percentage(validationPercentage): + break + else: + print "Please provide a number from 0 - 100." + if _check_sum_percentage(trainingPercentage, testPercentage, validationPercentage): + break + else: + print "Sum of percentages does not equal to 100. Please re-input the percentages." + + while True: + try: + seed = int(raw_input("Seed: ").strip()) + break + except ValueError: + print "Please provide a valid number." 
+ + # convert it to a percentage from 0 - 1 + trainingPercentage = trainingPercentage/100. + testPercentage = testPercentage/100. + validationPercentage = validationPercentage/100. + + return [trainingPercentage, testPercentage, validationPercentage], seed diff --git a/hermes/modules/cargo.py b/hermes/modules/cargo.py index b07171d..86c9af1 100644 --- a/hermes/modules/cargo.py +++ b/hermes/modules/cargo.py @@ -1,23 +1,28 @@ class Cargo(object): - """Cargo contains objects that are passed around in the state machine. + """Cargo is the object passed around in the state machine. + It encapsulates all the parameters needed for each state in one object. - Args: - scsingleton: Spark Context. There can only be one scsingleton running. - verbose: a boolean variable that prints out log messages - hdfs_dir: - fs_default_ip_addr: - error_msg: - """ - # TODO: implement cargo as object pool model? - def __init__(self): - self.scsingleton = None - self.verbose = False - self.hdfs_dir = None - self.fs_default_ip_addr = None - self.error_msg = "" - self.datas = [] # used until json_to_rdd_state - self.vectors = [] # used until develop_model_state - self.support_files = {} - self.recommenders = [] - self.metrics = [] + * hdfs_dir: Name of HDFS directory to store input data. + One of the option passed in when running hermes binary. + Default = datasets. + * fs_default_ip_addr: IP address of fs.default.name used in HDFS. + One of the arguments passed in when running hermes binary. + Default = localhost:9000. + * datas: List of Data objects initialized when extracting the configuration file. + * vectors: List of Vector objects initialized during one of the states in the state machine, json_to_rdd_state. + * support_files: Unrecognized items in [datasets] section of the configuration file that is presumed to be support files for the creation of a Vector. + * recommenders: List of recommender system algorithms initialized when extracting the configuration file. + * metrics: List of metrics initialized when extracting the configuration file. + * error_msg: It starts out as an empty string that will be initialized as an error message to the error state. + """ + # TODO: implement cargo as object pool model? 
+ def __init__(self): + self.hdfs_dir = None + self.fs_default_ip_addr = None + self.datas = [] # used until json_to_rdd_state + self.vectors = [] # used until develop_model_state + self.support_files = {} + self.recommenders = [] + self.metrics = [] + self.error_msg = "" diff --git a/hermes/modules/config.py b/hermes/modules/config.py index 2d34b39..3de0400 100644 --- a/hermes/modules/config.py +++ b/hermes/modules/config.py @@ -1,31 +1,37 @@ -import logging - -REQ_UV_HEADINGS = ("user_vector_data", "user_vector_types") +# recognized sections and their items +REQ_UV_HEADINGS = ("user_vector_data", "user_vector_transformations") UV_HEADINGS = () + REQ_UV_HEADINGS + ("user_vector_schemas",) -REQ_CV_HEADINGS = ("content_vector_data", "content_vector_types") +REQ_CV_HEADINGS = ("content_vector_data", "content_vector_transformations") CV_HEADINGS = () + REQ_CV_HEADINGS + ("content_vector_schemas",) DATASETS_HEADINGS = ("vectorizer",) + UV_HEADINGS + CV_HEADINGS HEADINGS = { "datasets": DATASETS_HEADINGS, \ - "recommenders": ("recommenders"), \ - "metrics": ("metrics") \ - } - -# get logger -logger = logging.getLogger("hermes") + "recommenders": ("recommenders"), \ + "metrics": ("metrics") \ + } def map_section(config_parser, section): - global logger - section_dict = {} - options = config_parser.options(section) - for option in options: - try: - section_dict[option] = config_parser.get(section, option) - if section_dict[option] == -1: - logger.debug(__name__ + ": map_section(): skipping option " + option) - except: - logger.error(__name__ + ": map_section(): exception on option " + option) - section_dict[option] = None - return section_dict + """ Map a section with the given section name and return a dictionary of the section. + + Args: + config_parser: config parser of the configuration file + section: section name to map + + Returns: + section_dict: a dictionary of the section. + Use section_dict to obtain the value of the item provided that you know the item name, ie. section_dict[item_name]. + """ + + section_dict = {} + options = config_parser.options(section) + for option in options: + try: + section_dict[option] = config_parser.get(section, option) + if section_dict[option] == -1: + Globals.logger.debug(__name__ + ": map_section(): skipping option " + option) + except: + Globals.logger.error(__name__ + ": map_section(): exception on option " + option) + section_dict[option] = None + return section_dict diff --git a/hermes/modules/data.py b/hermes/modules/data.py index a504fff..5b74d3b 100644 --- a/hermes/modules/data.py +++ b/hermes/modules/data.py @@ -1,36 +1,39 @@ import helper -import vectorgenerator +import vectorgenerator # TODO: avoid this? +# TODO: a better way of storing configuration from configuration file? class Data(object): - - def __init__(self, datapath, vector_type, schemapath, vectorizer): - if helper.is_filepath_valid(datapath): - self.datapath = datapath - self.vector_type = vector_type - self.schema = helper.get_schema(schemapath) - self.dataframe = None - self.vectorizer = vectorizer - # TODO: do we need to know from which config the data is from? 
- - def set_dataframe(self, scsingleton, datapath_in_hdfs): - self.dataframe = scsingleton.sqlCtx.read.json(datapath_in_hdfs, self.schema) - # explicitly repartition RDD after loading so that more tasks can run on it in parallel - # by default, defaultMinPartitions == defaultParallelism == estimated # of cores across all of the machines in your cluster - self.dataframe = self.dataframe.repartition(scsingleton.sc.defaultParallelism * 3) - - # set schema if it is not already set - if self.schema is None: - self.schema = self.dataframe.schema + """ Store configuration from configuration files. """ + + def __init__(self, datapath, vector_transformation, schemapath, vectorizer): + if helper.is_filepath_valid(datapath): + self.datapath = datapath + self.vectorizer = vectorizer + self.vector_transformation = vector_transformation + self.schema = helper.get_schema(schemapath) + self.dataframe = None + # TODO: do we need to know from which config the data is from? + + def set_dataframe(self, scsingleton, datapath_in_hdfs): + self.dataframe = scsingleton.sqlCtx.read.json(datapath_in_hdfs, self.schema) + # explicitly repartition RDD after loading so that more tasks can run on it in parallel + # by default, defaultMinPartitions == defaultParallelism == estimated # of cores across all of the machines in your cluster + # TODO: a better way to go about the dataframe repartition? + self.dataframe = self.dataframe.repartition(scsingleton.sc.defaultParallelism * 3) + + # set schema if it is not already set + if self.schema is None: + self.schema = self.dataframe.schema class UserVectorData(Data): - def __init__(self, datapath, vector_type, schemapath, vectorizer): - super(self.__class__, self).__init__(datapath, vector_type, schemapath, vectorizer) - self.which_vector = vectorgenerator.UserVector + def __init__(self, datapath, vector_transformation, schemapath, vectorizer): + super(self.__class__, self).__init__(datapath, vector_transformation, schemapath, vectorizer) + self.which_vector = vectorgenerator.UserVector class ContentVectorData(Data): - def __init__(self, datapath, vector_type, schemapath, vectorizer): - super(self.__class__, self).__init__(datapath, vector_type, schemapath, vectorizer) - self.which_vector = vectorgenerator.ContentVector + def __init__(self, datapath, vector_transformation, schemapath, vectorizer): + super(self.__class__, self).__init__(datapath, vector_transformation, schemapath, vectorizer) + self.which_vector = vectorgenerator.ContentVector diff --git a/hermes/modules/globals.py b/hermes/modules/globals.py new file mode 100644 index 0000000..c05d95e --- /dev/null +++ b/hermes/modules/globals.py @@ -0,0 +1,11 @@ +class Globals(object): + """Globals contains global variables shared by all files. + + Args: + verbose: a boolean variable that prints out debug log messages + logger: logging object that logs messages + scsingleton: Spark Context. There can only be one scsingleton running. 
+ """ + verbose = False + logger = None + scsingleton = None diff --git a/hermes/modules/helper.py b/hermes/modules/helper.py index c3aa2ec..f8dcf67 100644 --- a/hermes/modules/helper.py +++ b/hermes/modules/helper.py @@ -1,13 +1,15 @@ +"""Global helper functions""" + import os import json from pyspark.sql.types import StructType def is_filepath_valid(filepath): - return True if os.path.isfile(filepath) else False + return True if os.path.isfile(filepath) else False def get_schema(schema_path): - if not schema_path: - return None - with open(schema_path, "r") as schema_file: - return StructType.fromJson(json.load(schema_file)) + if not schema_path: + return None + with open(schema_path, "r") as schema_file: + return StructType.fromJson(json.load(schema_file)) diff --git a/hermes/modules/metricgenerator.py b/hermes/modules/metricgenerator.py index 37105e7..879df20 100644 --- a/hermes/modules/metricgenerator.py +++ b/hermes/modules/metricgenerator.py @@ -8,7 +8,7 @@ eggsecutor = MetricExecutor(RMSE()) print eggsecutor.execute(vector) -eggsecutor.changeAlgorithm(PRFS()) +eggsecutor.change_metric(PRFS()) print eggsecutor.execute(vector) """ @@ -32,29 +32,29 @@ def change_metric(self, new_metric): # ================================================================================ class MetricFactory(object): - def create_obj_metric(self, metric_str): - which_metric = getattr(sys.modules[__name__], metric_str) - if not which_metric: - # cannot find class - raise ValueError - else: - return which_metric() + def create_obj_metric(self, metric_str): + which_metric = getattr(sys.modules[__name__], metric_str) + if not which_metric: + # cannot find class + raise ValueError + else: + return which_metric() class Metric: def calculate_metric(self, vector=None) : - pass + pass class RMSE(Metric): def calculate_metric(self, vector): - return pm.calculate_rmse(vector.test_vector, vector.prediction_vector) + return pm.calculate_rmse(vector.test_vector, vector.prediction_vector) class MAE(Metric): - def calculate_metric(self, vector): - return pm.calculate_mae(vector.test_vector, vector.prediction_vector) + def calculate_metric(self, vector): + return pm.calculate_mae(vector.test_vector, vector.prediction_vector) class PRFS(Metric): def calculate_metric(self): - pass + raise NotImplemented diff --git a/hermes/modules/recommendergenerator.py b/hermes/modules/recommendergenerator.py index a086058..8943999 100644 --- a/hermes/modules/recommendergenerator.py +++ b/hermes/modules/recommendergenerator.py @@ -6,13 +6,13 @@ recommender = ALS(with_tfidf) recommender.make_prediction() -recommender = ALS(without_tfdif) +recommender = ALS(without_tfdif) # same as: recommender = ALS() recommender.make_prediction() recommender = CBWithKMeans(with_tfidf) recommender.make_prediction() -recommender = CBWithKMeans(without_tfidf) +recommender = CBWithKMeans(without_tfidf) # same as: recommender = CBWithKMeans recommender.make_prediction() """ @@ -22,81 +22,68 @@ import timer import pyspark.mllib.recommendation as mllib - -# get logger -logger = logging.getLogger("hermes") +from modules.globals import Globals # ================================================================================ # Background implementation interface # ================================================================================ class ImplementationInterface(object): - def make_prediction_with_als(self): - raise NotImplemented + def make_prediction_with_als(self): + raise NotImplemented - def make_prediction_with_cbkmeans(self): - raise 
NotImplemented + def make_prediction_with_cbkmeans(self): + raise NotImplemented # ================================================================================ # Concrete background implementations # ================================================================================ -# TODO: Interface is not necessary. -# Should we remove ImplementationInterface? Or keep it for design sake? +# TODO: ask Anna for the specifics class WithTfidf(ImplementationInterface): - def make_prediction_with_als(self, vector): - # create ALS model with tf-idf - pass - - def make_prediction_with_cbkmeans(self, vector): - # create CB with K-means with tf-idf - pass + def make_prediction_with_cbkmeans(self, vector): + # create CB with K-means with tf-idf + pass class WithoutTfidf(ImplementationInterface): - def make_prediction_with_als(self, vector): - # create ALS model without tf-idf - # TODO: specify rank based on what the user wants - model = mllib.ALS.train(vector.training_vector, rank=3) - prediction_vector = model.predictAll( vector.test_vector.map( lambda x: (x[0], x[1]) ) ).cache() - return prediction_vector + def make_prediction_with_als(self, vector): + # create ALS model without tf-idf + # TODO: specify rank based on what the user wants + model = mllib.ALS.train(vector.training_vector, rank=3) + prediction_vector = model.predictAll( vector.test_vector.map( lambda x: (x[0], x[1]) ) ).cache() + return prediction_vector - def make_prediction_with_cbkmeans(self, vector): - # create CB with K-means without tf-idf - pass - -# ================================================================================ -# Target Interface -# ================================================================================ - -class AbstractInterface(object): - def make_prediction(self): - raise NotImplemented + def make_prediction_with_cbkmeans(self, vector): + # create CB with K-means without tf-idf + pass # ================================================================================ # Bridge: bridge target interface & background implementation # ================================================================================ -# TODO: Interface is not necessary. -# Should we remove ImplementationInterface? Or keep it for design sake? 
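# Illustrative sketch only (not part of this patch): the point of the bridge is
# that another background implementation can be slotted in without touching the
# Recommender subclasses, which only ever call the make_prediction_with_* hooks.
# The class name below is hypothetical; the ALS call simply mirrors WithoutTfidf.
class WithHypotheticalPreprocessing(ImplementationInterface):
    def make_prediction_with_als(self, vector):
        # train on the training split, then predict the (user, item) pairs
        # of the test split, exactly as WithoutTfidf does
        model = mllib.ALS.train(vector.training_vector, rank=3)
        return model.predictAll(vector.test_vector.map(lambda x: (x[0], x[1]))).cache()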
-class Recommender(AbstractInterface): - def __init__(self, vector): - self.vector = vector - self.implementation = None +class Recommender(object): + def __init__(self, vector, implementation=WithoutTfidf()): + self.vector = vector + self.implementation = implementation + + def make_prediction(self): + # target interface + raise NotImplemented # ================================================================================ # Recommender Factory # ================================================================================ class RecommenderFactory(object): - def create_obj_recommender(self, recommender_str, vector, implementation=WithoutTfidf()): - which_recommender = getattr(sys.modules[__name__], recommender_str) - if not which_recommender: - # cannot find class - raise ValueError - else: - return which_recommender(vector, implementation) + def create_obj_recommender(self, recommender_str, vector, implementation=WithoutTfidf()): + which_recommender = getattr(sys.modules[__name__], recommender_str) + if not which_recommender: + # cannot find class + raise ValueError + else: + return which_recommender(vector, implementation) # ================================================================================ @@ -104,18 +91,10 @@ def create_obj_recommender(self, recommender_str, vector, implementation=Without # ================================================================================ class ALS(Recommender): - def __init__(self, vector, implementation=WithoutTfidf()): - self.vector = vector - self.implementation = implementation - - def make_prediction(self): - return self.implementation.make_prediction_with_als(self.vector) + def make_prediction(self): + return self.implementation.make_prediction_with_als(self.vector) class CBWithKMeans(Recommender): - def __init__(self, vector, implementation=WithoutTfidf()): - self.vector = vector - self.implementation = implementation - - def make_prediction(self): - return self.implementation.make_prediction_with_cbkmeans(self.vector) + def make_prediction(self): + return self.implementation.make_prediction_with_cbkmeans(self.vector) diff --git a/hermes/modules/singleton.py b/hermes/modules/singleton.py index c327406..767e142 100644 --- a/hermes/modules/singleton.py +++ b/hermes/modules/singleton.py @@ -4,6 +4,8 @@ from pyspark.sql import SQLContext class SCSingleton(object): + """ Wrapper for Spark Context to prevent multiple instantiation of the Spark Context. 
""" + __instance = None def __new__(cls, conf): diff --git a/hermes/modules/statemachine.py b/hermes/modules/statemachine.py index 5afdd90..06a4ada 100644 --- a/hermes/modules/statemachine.py +++ b/hermes/modules/statemachine.py @@ -1,36 +1,59 @@ -class InitializationError(Exception): pass +class InitializationError(Exception): + def __init__(self, value): + self.value = value + + def __str__(self): + return repr(self.value) class StateMachine: - def __init__(self): - self.handlers = [] - self.startState = None - self.endStates = [] - - def add_state(self, handler, isEndState=0): - self.handlers.append(handler) - if isEndState: - self.endStates.append(handler) - - def set_start(self, handler): - self.startState = handler - - def run(self, cargo=None): - if not self.startState: - raise InitializationError("Must call .set_start() before .run()") - if not self.endStates: - raise InitializationError("Must call .set_start() before .run()") - - handler = self.startState - - while True: - (newState, cargo) = handler(cargo) - if newState in self.endStates: - newState(cargo) - break - elif newState not in self.handlers: - print self.handlers - raise RuntimeError("Invalid state %s" % newState) - else: - handler = newState - - return self \ No newline at end of file + """ + To emulate a state machine. + + Example: + # state1 -> state2 -> state3a + -> state3b + # where state1, state2, state3a, and state3b are defined functions. + + import StateMachine + sm = StateMachine() + sm.add_state(state1) + sm.add_state(state2) + sm.add_state(state3a, isEndState=True) + sm.add_state(state3b, isEndState=True) + sm.set_start(state1) + sm.run() + """ + + def __init__(self): + self.handlers = [] + self.startState = None + self.endStates = [] + + def add_state(self, handler, isEndState=False): + self.handlers.append(handler) + if isEndState: + self.endStates.append(handler) + + def set_start(self, handler): + self.startState = handler + + def run(self, cargo=None): + if not self.startState: + raise InitializationError("Must call .set_start() before .run()") + if not self.endStates: + raise InitializationError("Must call .set_start() before .run()") + + handler = self.startState + + while True: + (newState, cargo) = handler(cargo) + if newState in self.endStates: + newState(cargo) + break + elif newState not in self.handlers: + print self.handlers + raise RuntimeError("Invalid state %s" % newState) + else: + handler = newState + + return self \ No newline at end of file diff --git a/hermes/modules/timer.py b/hermes/modules/timer.py index d61bb2d..5927d98 100644 --- a/hermes/modules/timer.py +++ b/hermes/modules/timer.py @@ -1,8 +1,16 @@ import time class Timer(object): - def __init__(self, verbose=False): - self.verbose = verbose + """ + To time how long a particular function runs. 
+ + Example: + import Timer + with Timer() as t: + somefunction() + print("somefunction() takes %s seconds" % t.secs) + print("somefunction() takes %s milliseconds" % t.msecs) + """ def __enter__(self): self.start = time.time() @@ -11,6 +19,4 @@ def __enter__(self): def __exit__(self, *args): self.end = time.time() self.secs = self.end - self.start - self.msecs = self.secs * 1000 - if self.verbose: - print "elapsed time: %f ms" % self.msecs \ No newline at end of file + self.msecs = self.secs * 1000 \ No newline at end of file diff --git a/hermes/modules/vectorgenerator.py b/hermes/modules/vectorgenerator.py index ea09b7a..8214e06 100644 --- a/hermes/modules/vectorgenerator.py +++ b/hermes/modules/vectorgenerator.py @@ -1,4 +1,3 @@ -import data # vector generator == rdd generator @@ -8,140 +7,145 @@ class VectorFactory(object): - def create_vector(self, sqlCtx, data, support_files): - vector = data.which_vector - for cls in vector.__subclasses__(): - if cls.isSameDataInstance(data): - return cls(sqlCtx, data, support_files).vector - else: - # cannot find class that builds the data - raise ValueError - - def create_obj_vector(self, sqlCtx, data, support_files): - vector = data.which_vector - for cls in vector.__subclasses__(): - if cls.isSameDataInstance(data): - return cls(sqlCtx, data, support_files) - else: - # cannot find class that builds the data - raise ValueError + def create_vector(self, sqlCtx, data, support_files): + vector = data.which_vector + for cls in vector.__subclasses__(): + if cls.isSameDataInstance(data): + return cls(sqlCtx, data, support_files).vector + else: + # cannot find class that builds the data + raise ValueError + + def create_obj_vector(self, sqlCtx, data, support_files): + vector = data.which_vector + for cls in vector.__subclasses__(): + if cls.isSameDataInstance(data): + return cls(sqlCtx, data, support_files) + else: + # cannot find class that builds the data + raise ValueError # ================================================================================ # Vector Factory Objects # ================================================================================ class Vector(object): - def __init__(self, sqlCtx, data, support_files): - self.sqlCtx = sqlCtx - self.data = data - self.support_files = support_files - get_vector_type = getattr(self, data.vector_type) - if not get_vector_type: - self.vector = None - else: - self.vector = get_vector_type() - self.training_vector = None - self.test_vector = None - self.validation_vector = None - self.prediction_vector = None - - def split_data(self, weights, seed): - training_vector, test_vector, validation_vector = self.vector.randomSplit(weights, seed) - self.training_vector = training_vector - self.test_vector = test_vector - self.validation_vector = validation_vector + def __init__(self, sqlCtx, data, support_files): + # TODO: remove sqlCtx because it is global? 
+ self.sqlCtx = sqlCtx + self.data = data + self.support_files = support_files + vector_transformation = getattr(self, data.vector_transformation) + if not vector_transformation: + self.vector = None + else: + self.vector = vector_transformation() + self.training_vector = None + self.test_vector = None + self.validation_vector = None + self.prediction_vector = None + + def split_data(self, weights, seed): + training_vector, test_vector, validation_vector = self.vector.randomSplit(weights, seed) + self.training_vector = training_vector + self.test_vector = test_vector + self.validation_vector = validation_vector # ================================================================================ # User Vector and Content Vector Factory Objects # ================================================================================ class UserVector(Vector): - pass + pass class ContentVector(Vector): - pass + pass # ================================================================================ # MovieLens # ================================================================================ +# TODO: separate in its own file +# TODO: do we need isSameDataInstance()? can we eliminate it? class MovieLens(object): - @classmethod - def isSameDataInstance(cls, comparisonData): - return comparisonData.vectorizer == "movielens" + @classmethod + def isSameDataInstance(cls, comparisonData): + return comparisonData.vectorizer == "movielens" class MovieLensUserVector(UserVector, MovieLens): - def ratings(self): - return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)) + def ratings(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)) - def pos_ratings(self): - return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)).filter(lambda (u, m, r): r > 3) + def pos_ratings(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)).filter(lambda (u, m, r): r > 3) - def ratings_to_interact(self): - return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, -1 if row.rating < 3 else 1)) + def ratings_to_interact(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, -1 if row.rating < 3 else 1)) class MovieLensContentVector(ContentVector, MovieLens): - def genre(self): - def genre_vectorizer(row): - return np.array(( - int(row.genre_action), - int(row.genre_adventure), - int(row.genre_animation), - int(row.genre_childrens), - int(row.genre_comedy), - int(row.genre_crime), - int(row.genre_documentary), - int(row.genre_drama), - int(row.genre_fantasy), - int(row.genre_filmnoir), - int(row.genre_horror), - int(row.genre_musical), - int(row.genre_mystery), - int(row.genre_romance), - int(row.genre_scifi), - int(row.genre_thriller), - int(row.genre_war), - int(row.genre_western), - )) - return self.data.dataframe.map(lambda row: (row.movie_id, )) + def genre(self): + def genre_vectorizer(row): + return np.array(( + int(row.genre_action), + int(row.genre_adventure), + int(row.genre_animation), + int(row.genre_childrens), + int(row.genre_comedy), + int(row.genre_crime), + int(row.genre_documentary), + int(row.genre_drama), + int(row.genre_fantasy), + int(row.genre_filmnoir), + int(row.genre_horror), + int(row.genre_musical), + int(row.genre_mystery), + int(row.genre_romance), + int(row.genre_scifi), + int(row.genre_thriller), + int(row.genre_war), + int(row.genre_western), + )) + return self.data.dataframe.map(lambda row: (row.movie_id, genre_vectorizer(row))) # 
================================================================================ # Wiki # ================================================================================ +# TODO: separate in its own file class Wiki(object): - @classmethod - def isSameDataInstance(cls, comparisonData): - return comparisonData.vectorizer == "wiki" + @classmethod + def isSameDataInstance(cls, comparisonData): + return comparisonData.vectorizer == "wiki" class WikiUserVector(UserVector, Wiki): - def __init__(self): - super(self.__class__, self).__init__() - self.filtered = self.sqlCtx.sql("select * from ratings where redirect_target is null and article_namespace=0 and user_id is not null") - self.filtered.registerTempTable("wiki_ratings") + def __init__(self): + super(self.__class__, self).__init__() + self.filtered = self.sqlCtx.sql("select * from ratings where redirect_target is null and article_namespace=0 and user_id is not null") + self.filtered.registerTempTable("wiki_ratings") - def num_edits(self): - return self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings group by user_id, article_id") + def num_edits(self): + return self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings group by user_id, article_id") - def any_interact(self): - return self.sqlCtx.sql("select user_id as user, article_id as item, 1 as rating from wiki_ratings group by user_id, article_id") + def any_interact(self): + return self.sqlCtx.sql("select user_id as user, article_id as item, 1 as rating from wiki_ratings group by user_id, article_id") - def num_edits_ceil(self): - return self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki group by user_id, article_id")\ - .map(lambda (user, article, rating): (user, article, max(rating, 5))) + def num_edits_ceil(self): + return self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings group by user_id, article_id")\ + .map(lambda (user, article, rating): (user, article, max(rating, 5))) class WikiContentVector(ContentVector, Wiki): - def __init__(self): - super(self.__class__, self).__init__() - self.filtered_content = sqlCtx.sqlCtx.sql("select * from content where redirect_target is null and article_namespace=0 and full_text is not null") - self.filtered_content.registerTempTable("wiki_content") + def __init__(self): + super(self.__class__, self).__init__() + self.filtered_content = sqlCtx.sqlCtx.sql("select * from content where redirect_target is null and article_namespace=0 and full_text is not null") + self.filtered_content.registerTempTable("wiki_content") - def glove(self): - pass + def glove(self): + pass - def category_map(self): - pass + def category_map(self): + pass # ================================================================================ # ADD ADDITIONAL UserVector and ContentVector based on a given data # ================================================================================ + diff --git a/src/utils/save_load.py b/hermes/utils/save_load.py similarity index 100% rename from src/utils/save_load.py rename to hermes/utils/save_load.py diff --git a/scripts/create_file_containing_paths.py b/scripts/create_file_containing_paths.py deleted file mode 100644 index 596997f..0000000 --- a/scripts/create_file_containing_paths.py +++ /dev/null @@ -1,81 +0,0 @@ -"""Create json_paths.txt and schema_paths.txt that you can pass in to hermes. -Outputs: - 1. 
json_paths.txt: lists all path to JSON files used in hermes - 2. schema_paths.txt: lists all path to schema files used in hermes -""" - -import os -from distutils.util import strtobool - -def file_accessible(filepath, mode): - """Check if a file exists and is accessible.""" - try: - f = open(filepath, mode) - f.close() - except IOError as e: - return False - - return True - -def parse_yn(answer): - answer.upper().strip() - -def main(): - - # create output directory if it did not exist - output_dir = os.path.dirname(os.path.realpath(__file__)) + "/output" - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - # ask user for path to JSON file and its respective schema file - i = 0 - json_paths = [] - schema_paths = [] - is_last = False - while True: - while True: - json_path = raw_input("(" + str(i) + ") Enter path to a JSON file: ") - if file_accessible(json_path.strip(), "r"): - json_paths.append(json_path) - break - else: - print "Please input a JSON file that exists or is accessible." - while True: - schema_path = raw_input("(" + str(i) + ") Enter path to its respective schema file (or empty string if there is no schema): ") - if file_accessible(schema_path.strip(), "r"): - schema_paths.append(schema_path) - break - elif schema_path.strip() == "": - schema_paths.append("") - break - else: - print "Please input a schema file that exists or is accessible." - while True: - add_more = raw_input("Do you need to add more JSON file? [Y/N] ") - try: - if bool(strtobool(add_more.upper().strip())): - i = i + 1 - else: - is_last = True - break - except ValueError: - print "Please respond with a Y or N." - if is_last: - break - - # create a file with a list of JSON file paths - json_file = output_dir + "/json_paths.txt" - with open(json_file, "w") as f: - for json_path in json_paths: - f.write(json_path + "\n") - - # create a file with a list of schema file paths - schema_file = output_dir + "/schema_paths.txt" - with open(schema_file, "w") as f: - for schema_path in schema_paths: - f.write(schema_path + "\n") - - return - -if __name__ == "__main__": - main() From 7d06631e99cfe53057aa19b649ed3f644f090fc6 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Mon, 4 Jan 2016 12:10:33 -0800 Subject: [PATCH 13/39] fix rebase with bookcrossing --- hermes/utils/book_crossing_etl/bookcrossing.py | 10 +++++----- hermes/utils/lastfm_etl/lastfm.py | 12 ++++++------ hermes/utils/save_load.py | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/hermes/utils/book_crossing_etl/bookcrossing.py b/hermes/utils/book_crossing_etl/bookcrossing.py index af806f4..d91a8ec 100755 --- a/hermes/utils/book_crossing_etl/bookcrossing.py +++ b/hermes/utils/book_crossing_etl/bookcrossing.py @@ -217,7 +217,7 @@ def parse_book_line(line): ) parser.add_argument( '-o', - '--output_directory', + '--output-directory', type=str, action="store", help="the directory to save the output JSON files, by default the current directory", @@ -257,8 +257,8 @@ def parse_book_line(line): with\ open(args.ratings, 'rb') as csvfile,\ - open("implicit_ratings.json", 'w') as imp,\ - open("explicit_ratings.json", 'w') as exp: + open("book-crossing_implicit_ratings.json", 'w') as imp,\ + open("book-crossing_explicit_ratings.json", 'w') as exp: for line in iter_lines(csvfile): ret = parse_rating_line(line) @@ -275,11 +275,11 @@ def parse_book_line(line): # outputs. 
rated_and_valid_users = set(rated_users) - with open("books.json", 'w') as f: + with open("book-crossing_books.json", 'w') as f: for ret in book_data: f.write(json.dumps(ret) + '\n') - with open("users.json", 'w') as f: + with open("book-crossing_users.json", 'w') as f: for ret in users_data: if ret["user_id"] in rated_and_valid_users: f.write(json.dumps(ret) + '\n') diff --git a/hermes/utils/lastfm_etl/lastfm.py b/hermes/utils/lastfm_etl/lastfm.py index 123bfbe..1b0c292 100755 --- a/hermes/utils/lastfm_etl/lastfm.py +++ b/hermes/utils/lastfm_etl/lastfm.py @@ -276,7 +276,7 @@ def parse_plays_line(line): ) parser.add_argument( '-o', - '--output_directory', + '--output-directory', type=str, action="store", help="the directory to save the output JSON files, by default the current directory", @@ -287,11 +287,11 @@ def parse_plays_line(line): # Parse the files processing_queue = ( - (args.artists, args.output_directory + "/artists.json", parse_artist_line), - (args.tags, args.output_directory + "/tags.json", parse_tag_line), - (args.friends, args.output_directory + "/friends.json", parse_friends_line), - (args.applied_tags, args.output_directory + "/applied_tags.json", parse_applied_tag_line), - (args.plays, args.output_directory + "/plays.json", parse_plays_line), + (args.artists, args.output_directory + "/lastfm_artists.json", parse_artist_line), + (args.tags, args.output_directory + "/lastfm_tags.json", parse_tag_line), + (args.friends, args.output_directory + "/lastfm_friends.json", parse_friends_line), + (args.applied_tags, args.output_directory + "/lastfm_applied_tags.json", parse_applied_tag_line), + (args.plays, args.output_directory + "/lastfm_plays.json", parse_plays_line), ) for input_file, output_file, function in processing_queue: with open(input_file, 'rb') as csv_file, open(output_file, 'w') as json_file: diff --git a/hermes/utils/save_load.py b/hermes/utils/save_load.py index 10475ac..f395bff 100644 --- a/hermes/utils/save_load.py +++ b/hermes/utils/save_load.py @@ -95,4 +95,4 @@ def save_cv_to_hadoop(vector, output_name): def load_cv_from_hadoop(input_name,sc, num_partitions=20): cv = sc.pickleFile(input_name).repartition(num_partitions) - return cv \ No newline at end of file + return cv From d41849f0ce78298c0f7037ce81e8424de76e5b4a Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Mon, 4 Jan 2016 12:26:42 -0800 Subject: [PATCH 14/39] revise to raise NotImplemented error for functions not yet implemented instead of a pass --- hermes/modules/metricgenerator.py | 5 ++--- hermes/modules/recommendergenerator.py | 10 +++++++--- hermes/modules/vectorgenerator.py | 4 ++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/hermes/modules/metricgenerator.py b/hermes/modules/metricgenerator.py index 879df20..fb74594 100644 --- a/hermes/modules/metricgenerator.py +++ b/hermes/modules/metricgenerator.py @@ -42,7 +42,7 @@ def create_obj_metric(self, metric_str): class Metric: def calculate_metric(self, vector=None) : - pass + raise NotImplemented class RMSE(Metric): def calculate_metric(self, vector): @@ -53,8 +53,7 @@ def calculate_metric(self, vector): return pm.calculate_mae(vector.test_vector, vector.prediction_vector) class PRFS(Metric): - def calculate_metric(self): - raise NotImplemented + pass diff --git a/hermes/modules/recommendergenerator.py b/hermes/modules/recommendergenerator.py index 8943999..030b1e7 100644 --- a/hermes/modules/recommendergenerator.py +++ b/hermes/modules/recommendergenerator.py @@ -42,9 +42,12 @@ def make_prediction_with_cbkmeans(self): # TODO: ask 
Anna for the specifics class WithTfidf(ImplementationInterface): + """ + # TODO def make_prediction_with_cbkmeans(self, vector): # create CB with K-means with tf-idf - pass + raise NotImplemented + """ class WithoutTfidf(ImplementationInterface): def make_prediction_with_als(self, vector): @@ -54,10 +57,11 @@ def make_prediction_with_als(self, vector): prediction_vector = model.predictAll( vector.test_vector.map( lambda x: (x[0], x[1]) ) ).cache() return prediction_vector - + """ def make_prediction_with_cbkmeans(self, vector): # create CB with K-means without tf-idf - pass + raise NotImplemented + """ # ================================================================================ # Bridge: bridge target interface & background implementation diff --git a/hermes/modules/vectorgenerator.py b/hermes/modules/vectorgenerator.py index 8214e06..6e6a59d 100644 --- a/hermes/modules/vectorgenerator.py +++ b/hermes/modules/vectorgenerator.py @@ -140,10 +140,10 @@ def __init__(self): self.filtered_content.registerTempTable("wiki_content") def glove(self): - pass + raise NotImplemented def category_map(self): - pass + raise NotImplemented # ================================================================================ # ADD ADDITIONAL UserVector and ContentVector based on a given data From 6e9bdf6f2404373e68e54845db7651802c853312 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Mon, 4 Jan 2016 14:58:02 -0800 Subject: [PATCH 15/39] fix logging for INFO that prints twice --- hermes/hermes.py | 1 + hermes/hermesctl.py | 6 +----- hermes/modules/recommendergenerator.py | 2 +- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/hermes/hermes.py b/hermes/hermes.py index 7c0c83a..6db6120 100644 --- a/hermes/hermes.py +++ b/hermes/hermes.py @@ -96,6 +96,7 @@ def make_prediction_state(cargo): for i in range(0, len(cargo.vectors)): for r in cargo.recommenders: + if Globals.verbose: Globals.logger.debug("Making recommendation %s on data %s", r, cargo.vectors[i].data.datapath) # TODO: implement other implementations, ie. WithTfidf(), etc. 
# default is WithoutTfidf() recommender = rg.RecommenderFactory().create_obj_recommender(r, cargo.vectors[i]) diff --git a/hermes/hermesctl.py b/hermes/hermesctl.py index 521e0dd..e78d952 100644 --- a/hermes/hermesctl.py +++ b/hermes/hermesctl.py @@ -56,8 +56,6 @@ def create_logger(name): che = logging.StreamHandler() che.setLevel(logging.ERROR) # create console handler for stdout for info, debug, and error level - choi = logging.StreamHandler(sys.stdout) - choi.setLevel(logging.INFO) chod = logging.StreamHandler(sys.stdout) chod.setLevel(logging.DEBUG) choe = logging.StreamHandler(sys.stdout) @@ -66,13 +64,11 @@ def create_logger(name): formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") fh.setFormatter(formatter) che.setFormatter(formatter) - choi.setFormatter(formatter) chod.setFormatter(formatter) choe.setFormatter(formatter) # add handlers to logger logger.addHandler(fh) logger.addHandler(che) - logger.addHandler(choi) logger.addHandler(chod) logger.addHandler(choe) @@ -311,7 +307,7 @@ def main(verbose, hdfs_dir, fs_default_ip_addr, list_of_files_config, configs): extract_configs(configs, list_of_files_config, cargo) # run state machine - stateMachine.run(cargo) + state_machine.run(cargo) diff --git a/hermes/modules/recommendergenerator.py b/hermes/modules/recommendergenerator.py index 030b1e7..ab73681 100644 --- a/hermes/modules/recommendergenerator.py +++ b/hermes/modules/recommendergenerator.py @@ -17,7 +17,6 @@ """ -import logging import sys import timer import pyspark.mllib.recommendation as mllib @@ -58,6 +57,7 @@ def make_prediction_with_als(self, vector): return prediction_vector """ + # TODO def make_prediction_with_cbkmeans(self, vector): # create CB with K-means without tf-idf raise NotImplemented From d33026dc58f8c65da60f0789caa469387bb7d9cf Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Mon, 4 Jan 2016 15:54:02 -0800 Subject: [PATCH 16/39] change vectorizer to dataname to be more clear --- docs/configs.md | 22 +++++++++++----------- docs/data_supported.md | 16 ++++++++-------- docs/framework.md | 12 +++++++----- docs/glossary.md | 4 +++- hermes/configs/movielens_config.ini | 2 +- hermes/configs/wiki_config.ini | 2 +- hermes/hermesctl.py | 20 ++++++++++---------- hermes/modules/config.py | 2 +- hermes/modules/data.py | 12 ++++++------ hermes/modules/vectorgenerator.py | 8 ++++---- 10 files changed, 52 insertions(+), 48 deletions(-) diff --git a/docs/configs.md b/docs/configs.md index 09e2810..8c69c77 100644 --- a/docs/configs.md +++ b/docs/configs.md @@ -1,11 +1,11 @@ # Hermes's Configuration Files Explained * [List of Files Standard](#list-of-files-standard) - * [Vectorizer](#vectorizer) + * [Dataname](#dataname) * [JSON Paths](#json-paths) * [Configuration File Standard](#configuration-file-standard) * [Datasets](#datasets) - * [Vectorizer](#vectorizer) + * [Dataname](#dataname) * [Vectors](#vectors) * [Optional Variables: Schemas & Support Files](#optional-variables) * [Recommenders](#recommenders) @@ -58,17 +58,17 @@ movielens_1m_tags_schema = /path/to/your/movielens/1m/tags.json_schema.gz movielens_1m_movies_schema = /path/to/your/movielens/1m/movies_schema.json.gz ``` -### Vectorizer +### Dataname -A single data can be split into multiple JSON files. In this case, [movielens] is a data that is split into multiple JSON files. For lack of a better term, we call [movielens] a "vectorizer" variable. There can be multiple vectorizers in a list of files (ie. 
list_of_files.ini), but there can only be one vectorizer in a configuration file (ie. config.ini). +A single data can be split into multiple JSON files. In this case, [movielens] is a data that is split into multiple JSON files. For lack of a better term, we call [movielens] a "dataname" variable. There can be multiple datanames in a list of files (ie. list_of_files.ini), but there can only be one dataname in a configuration file (ie. config.ini). -Vectorizer plays an important role in that we know which data each JSON file is coming from. This check can be found in hermes/hermes/modules/vectorgenerator.py under a class function called isSameDataInstance() for each data instantiated class. What is checked in isSameDataInstance() has to match the vectorizer exactly. If it did not, Hermes will throw an error message. +Dataname plays an important role in that we know which data each JSON file is coming from. This check can be found in hermes/hermes/modules/vectorgenerator.py under a class function called isSameDataInstance() for each data instantiated class. What is checked in isSameDataInstance() has to match the dataname exactly. If it did not, Hermes will throw an error message. -For example, in the case of the Movie Lens data, its vectorizer is "movielens". The check in the class MovieLens's isSameDataInstance() function will check that vectorizer is equal to "movielens". If you passed [MovieLens] to list_of_files.ini, for example, and the check in isSameDataInstance() is "movielens", it will fail. However, if you passed [movielens] to list_of_files.ini and the check in isSameDataInstance() is "movielens", it will pass. +For example, in the case of the Movie Lens data, its dataname is "movielens". The check in the class MovieLens's isSameDataInstance() function will check that dataname is equal to "movielens". If you passed [MovieLens] to list_of_files.ini, for example, and the check in isSameDataInstance() is "movielens", it will fail. However, if you passed [movielens] to list_of_files.ini and the check in isSameDataInstance() is "movielens", it will pass. ### JSON Paths -Underneath the vectorizer heading, each variable (ie. movielens_20m_ratings, movielens_20m_tags, etc.) is a shorthand name for a specific JSON file. These variables will store the path to their individual JSON file. They will be used in the configuration file (ie. config.ini) as input to user_vector_data and content_vector_data variable. +Underneath the dataname heading, each variable (ie. movielens_20m_ratings, movielens_20m_tags, etc.) is a shorthand name for a specific JSON file. These variables will store the path to their individual JSON file. They will be used in the configuration file (ie. config.ini) as input to user_vector_data and content_vector_data variable. ## Configuration File Standard @@ -82,7 +82,7 @@ Let's take a look at an example file called config.ini. ```bash [datasets] -vectorizer = movielens +dataname = movielens # user vector user_vector_data = ["movielens_10m_ratings", "movielens_20m_ratings"] @@ -104,11 +104,11 @@ metrics = ["RMSE", "MAE"] ### Datasets -Datasets specify which data we are going to use. It contains vectorizer, user or content vectors, and support files. +Datasets specify which data we are going to use. It contains dataname, user or content vectors, and support files. -#### Vectorizer +#### Dataname -One configuration file can specify only one vectorizer. Vectorizer is the name of the data where each JSON file is derived from. 
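As a point of reference, the dataname given here is matched verbatim by the check each dataset class defines in hermes/hermes/modules/vectorgenerator.py. A minimal sketch of that check for the Movie Lens data, shown only for illustration, is:

```python
class MovieLens(object):
    @classmethod
    def isSameDataInstance(cls, comparisonData):
        # "movielens" must match the dataname given in both configuration files exactly
        return comparisonData.dataname == "movielens"
```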
+One configuration file can specify only one dataname. Dataname is the name of the data where each JSON file is derived from. #### Vectors diff --git a/docs/data_supported.md b/docs/data_supported.md index 674cce4..74d630a 100644 --- a/docs/data_supported.md +++ b/docs/data_supported.md @@ -28,10 +28,10 @@ Before continuing, it might be beneficial if you understand the Hermes's framewo ### Configuration Files For JSON files derived from Movie Lens data, you need to specify the following: -* In configuration file, specify vectorizer = movielens +* In configuration file, specify dataname = movielens * In configuraiton file that lists all JSON files, specify section as [movielens] -As long as the vectorizer check matches with the vectorizer given in the configuration files, Hermes will recognize it as a Movie Lens data. This check can be found in hermes/hermes/modules/vectorgenerator.py under a class function called isSameDataInstance(). What is checked in isSameDataInstance() has to match the vectorizer exactly. If it did not, Hermes will throw an error message. In this case, vectorizer has to match "movielens" exactly to recognize that this is a Movie Lens data. +As long as the dataname check matches with the dataname given in the configuration files, Hermes will recognize it as a Movie Lens data. This check can be found in hermes/hermes/modules/vectorgenerator.py under a class function called isSameDataInstance(). What is checked in isSameDataInstance() has to match the dataname exactly. If it did not, Hermes will throw an error message. In this case, dataname has to match "movielens" exactly to recognize that this is a Movie Lens data. ### Vector Transformation for User Vector @@ -51,10 +51,10 @@ You can specify the vector transformation on a content vector by specifying cont ### Configuration Files For JSON files derived from Wikipedia data, you need to specify the following: -* In configuration file, specify vectorizer = wiki +* In configuration file, specify dataname = wiki * In configuration file that lists all JSON files, specify section as [wiki] -As long as the vectorizer check matches with the vectorizer given in the configuration files, Hermes will recognize it as a Wikipedia data. This check can be found in hermes/hermes/modules/vectorgenerator.py under a class function called isSameDataInstance(). What is checked in isSameDataInstance() has to match the vectorizer exactly. If it did not, Hermes will throw an error message. In this case, vectorizer has to match "wiki" exactly to recognize that this is a Wikipedia data. +As long as the dataname check matches with the dataname given in the configuration files, Hermes will recognize it as a Wikipedia data. This check can be found in hermes/hermes/modules/vectorgenerator.py under a class function called isSameDataInstance(). What is checked in isSameDataInstance() has to match the dataname exactly. If it did not, Hermes will throw an error message. In this case, dataname has to match "wiki" exactly to recognize that this is a Wikipedia data. #### Vector Transformation for User Vector @@ -82,7 +82,7 @@ Template: class NewDataset(object): @classmethod def isSameDataInstance(cls, comparisonData): - return comparisonData.vectorizer == "new_dataset_vectorizer_name" + return comparisonData.dataname == "new_dataset_dataname_name" class NewDatasetUserVector(UserVector, NewDataset): def user_vector_transformation_1(self): @@ -108,7 +108,7 @@ class NewDatasetContentVector(ContentVector, NewDataset): 1. Instantiate a class for your dataset. 
In this case, it is specified as class NewDataset. 2. Instantiate a User Vector and a Content Vector class for your dataset that inherits from your dataset class and UserVector or Content Vector respectively. In this case, the UserVector for NewDataset is called NewDataSetUserVector, and the ContentVector for NewDataset is called NewDataContentVector. -3. Provide the vectorizer name for the check in isSameDataInstance(). In this case, vectorizer is checked if it's equal to "new_dataset_vectorizer_name". +3. Provide the dataname name for the check in isSameDataInstance(). In this case, dataname is checked if it's equal to "new_dataset_dataname_name". 4. Provide the vector transformation logic for each type of vectors. For User Vector transformations, define the function in the class NewDatasetUserVector. In this case, these vector transformations are user_vector_transformation_1, user_vector_transformation_2, and user_vector_transformation_n. For Content Vector transformations, define the function in the class NewDatasetContentVector. In this case, the vector transformation is content_vector_trasnformation_1. 5. Additional support files needed for the vector transformation is passed down from the configuration file as self.support_files. self.support_files is a dictionary with the key as a variable and the value as the value received in the configuration file. Please read on the [configuration file guide](https://github.com/Lab41/hermes/tree/master/docs/configs.md#optional-variables) for more details. @@ -116,7 +116,7 @@ After you have defined the concrete implementation of the new dataset, you can n In list_of_files.ini: ```bash -[new_dataset_vectorizer_name] +[new_dataset_dataname_name] new_dataset_10m_ratings = /path/to/your/new/dataset/10m/ratings.json.gz new_dataset_20m_ratings = /path/to/your/new/dataset/20m/ratings.json.gz new_dataset_10m_ratings_schema = /path/to/your/new/dataset/10m/ratings_schema.json.gz @@ -129,7 +129,7 @@ new_dataset_10m_movies_schema = /path/to/your/new/dataset/10m/movies_schema.json In new_dataset_config.ini: ```bash [datasets] -vectorizer = new_dataset_vectorizer_name +dataname = new_dataset_dataname_name # user vector user_vector_data = ["new_dataset_10m_ratings", "new_dataset_20m_ratings"] diff --git a/docs/framework.md b/docs/framework.md index f3a7692..528337e 100644 --- a/docs/framework.md +++ b/docs/framework.md @@ -117,7 +117,7 @@ We employ the logging library to log INFO, DEBUG, and ERROR messages. The logger All INFO messages are outputted to the command line. -ALL DEBUG messages are outputted to the command line and a log file called hermes.log. hermes.log is created wherever the hermes binary is run. Debug messages will only print when the --verbose option is passed. +ALL DEBUG messages are outputted to the command line and a log file called hermes.log. hermes.log is created whenever the hermes binary is run. Debug messages will only print when the --verbose option is passed. ALL ERROR messages are outputted to the command line and stderr. @@ -149,6 +149,8 @@ To add an end state, add the following line in hermesctl's add_states(): state_machine.add_state(hermes.new_state, isEndState=True) ``` +Make sure you define your state as well; otherwise, the framework will output an error. Please follow the instructions in [Defining a New State](#defining-a-new-state). + #### Adding New Variables in Cargo Cargo is the object passed around in the state machine. 
Since we can never know until runtime where each state has derived from and where it will go next, we do not know what parameters to pass into each state. Cargo encapsulates all the parameters needed for each state in one object. It is defined in [cargo.py](#cargopy) and instantiated in hermesctl's main(). Future implementation will clean up Cargo so that one state does not know what another state's parameter needs are unless necessary (TODO: in development). @@ -163,7 +165,7 @@ Configuration Files are currently extracted via the ConfigParser library. In the Listed below are recognized sections and their respective items: * datasets - * vectorizer + * dataname * user_vector_data * user_vector_transformations * user_vector_schemas @@ -177,7 +179,7 @@ Listed below are recognized sections and their respective items: What Hermes will do when it encounters unrecognized section or section's item: * If it does not recognize the section, it will skip the entire section. -* In datasets section, if vectorizer is not specified, it will quit the program. +* In datasets section, if dataname is not specified, it will quit the program. * In datasets section, if User Vector (user_vector_data, user_vector_transformation) or Content Vector (content_vector_data, content_vector_transformation) or both are not specified, it will quit the program. In the future, it will also quit the program if it does not have User Vector and Content Vector specified when Content Vector is already specified (TODO: in development). * Any other items in datasets that are not recognized are treated as a support_file item, meaning the variable is placed as a key and its value is placed as a value in a dictionary called support_files to be used later when generating the vector. * In recommenders section, any items that are not recognized will be skipped. In the future, extra parameter variables needed for recommender system algorithms will be recognized (TODO: in development). @@ -331,7 +333,7 @@ Every vector type inherits from the Vector class, meaning all User Vector and Co Since each data requires its own specific vector transformation, every data has its own class as well as its own UserVector and ContentVector. The data's UserVector and ContentVector inherit from both the data's own class as well as UserVector or ContentVector respectively. The data's UserVector and ContentVector have functions defined in their class to execute vector transformation. The name of these functions has to match the name of the vector transformation passed in via the configuration file in order for the vector transformation to occur. -Vectorizer is a variable used in configuration file to refer to the data where each JSON file is coming from. The data's own class has a check function called isSameDataInstance() to verify that the vectorizer passed in via the configuration file is describing about the same data as data's own class. +Vectorizer is a variable used in configuration file to refer to the data where each JSON file is coming from. The data's own class has a check function called isSameDataInstance() to verify that the dataname passed in via the configuration file is describing about the same data as data's own class. To automatically create a vector (ie. which vector type and from which data), VectorFactory is there to the rescue! It can either return a Vector object or the RDD / vector itself by calling VectorFactory().create_obj_vector(...) or VectorFactory().create_vector(...) respectively. 
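A rough usage sketch follows; the import path is an assumption, and sqlCtx, data, and support_files are assumed to already be in scope, but the factory and split_data calls themselves come from vectorgenerator.py:

```python
from modules.vectorgenerator import VectorFactory  # import path assumed

# data is a UserVectorData or ContentVectorData built from the configuration file;
# support_files is the dictionary of unrecognized [datasets] items.
vector_obj = VectorFactory().create_obj_vector(sqlCtx, data, support_files)
vector_obj.split_data([0.6, 0.4, 0.0], seed=11)   # training / test / validation split

# or, when only the transformed RDD itself is needed:
rdd = VectorFactory().create_vector(sqlCtx, data, support_files)
```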
@@ -357,7 +359,7 @@ For example: if you wanted to create a vector transformation for MovieLens data' class MovieLens(object): @classmethod def isSameDataInstance(cls, comparisonData): - return comparisonData.vectorizer == "movielens" + return comparisonData.dataname == "movielens" class MovieLensUserVector(UserVector, MovieLens): def ratings(self): diff --git a/docs/glossary.md b/docs/glossary.md index e9ad1fe..f441251 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -12,6 +12,8 @@ This is a glossary of common terms used in Hermes and their specified meaning. ## D **Dataframe**: A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. (Excerpt taken from Spark's SQL Programming Guide). In Hermes, the dataframe variable defined in the Data class refers to the dataframe created after reading in the JSON file. +**Dataname**: Dataname is a variable used in configuration file to refer to the data where each JSON file is coming from. + ## E ## F @@ -71,7 +73,7 @@ This is a glossary of common terms used in Hermes and their specified meaning. **Vector Type**: Hermes separates vectors into two distinct types: User Vector and Content Vector. User Vector refers to the vector describing users in the data. Content Vector refers to the vector describing content in the data. Users can implement other vector types as needed if User Vector and Content Vector does not describe the vector they are building. -**Vectorizer**: Vectorizer is a variable used in configuration file to refer to the data where each JSON file is coming from. +**Vectorizer**: see "Dataname". This is a deprecated name used before we decided to stick with "Dataname". ## W diff --git a/hermes/configs/movielens_config.ini b/hermes/configs/movielens_config.ini index 265eaf2..8b83aae 100644 --- a/hermes/configs/movielens_config.ini +++ b/hermes/configs/movielens_config.ini @@ -1,5 +1,5 @@ [datasets] -vectorizer = movielens +dataname = movielens user_vector_data = ["movielens_10m_ratings"] user_vector_schemas = ["movielens_10m_ratings_schema"] user_vector_transformations = ["ratings"] diff --git a/hermes/configs/wiki_config.ini b/hermes/configs/wiki_config.ini index 723a61a..56a5800 100644 --- a/hermes/configs/wiki_config.ini +++ b/hermes/configs/wiki_config.ini @@ -3,7 +3,7 @@ output_directory = /output/wikipedia_cbkmeans [datasets] -vectorizer = wiki +dataname = wiki user_vector_data = ["edit_history"] user_vector_transformations = ["num_edits_ceil"] content_vector_data = ["full_text"] diff --git a/hermes/hermesctl.py b/hermes/hermesctl.py index e78d952..a9c3afd 100644 --- a/hermes/hermesctl.py +++ b/hermes/hermesctl.py @@ -134,32 +134,32 @@ def handle_dataset_section(dataset_items, config_path): """ Helper function that handles [datasets] section. """ # TODO: which is better? iterating through sections then items or iterating through just items of list_of_files_config? 
- # make sure vectorizer is initialized in order to verify the section in list_of_files_config - if not ("vectorizer" in datasets_items.keys()): - Globals.logger.error("ERROR: config " + config_path + " must have vectorizer specified.") + # make sure dataname is initialized in order to verify the section in list_of_files_config + if not ("dataname" in datasets_items.keys()): + Globals.logger.error("ERROR: config " + config_path + " must have dataname specified.") sys.exit() - vectorizer = datasets_items["vectorizer"] - lofmap = config.map_section(lofcp, vectorizer) + dataname = datasets_items["dataname"] + lofmap = config.map_section(lofcp, dataname) # create UserVectorData or ContentVectorData or both hasUserVector = False # check it has the required items to build a UserVectorData if set(config.REQ_UV_HEADINGS) < set(datasets_items.keys()): hasUserVector = True - create_datas(lofmap, vectorizer, datasets_items, config_path, isUserVector=True) + create_datas(lofmap, dataname, datasets_items, config_path, isUserVector=True) hasContentVector = False # check it has the required items to build a ContentVectorData if set(config.REQ_CV_HEADINGS) < set(datasets_items.keys()): hasContentVector = True - create_datas(lofmap, vectorizer, datasets_items, config_path, isUserVector=False) + create_datas(lofmap, dataname, datasets_items, config_path, isUserVector=False) if not hasUserVector and not hasContentVector: Globals.logger.error("ERROR: config " + config_path + " does not have declaration for a user vector or a content vector") sys.exit() - def create_datas(lofmap, vectorizer, datasets_items, config_path, isUserVector): + def create_datas(lofmap, dataname, datasets_items, config_path, isUserVector): """ Helper function that creates a UserVectorData or ContentVectorData depending if it isUserVector or not. Storing configuration for UserVector or ContentVector in an object (like UserVectorData and ContentVectorData) @@ -205,10 +205,10 @@ def create_datas(lofmap, vectorizer, datasets_items, config_path, isUserVector): schemapath = None if isUserVector: - uservectordata = UserVectorData(datapath, vector_transformation, schemapath, vectorizer) + uservectordata = UserVectorData(datapath, vector_transformation, schemapath, dataname) cargo.datas.append(uservectordata) else: - contentvectordata = ContentVectorData(datapath, vector_transformation, schemapath, vectorizer) + contentvectordata = ContentVectorData(datapath, vector_transformation, schemapath, dataname) cargo.datas.append(contentvectordata) # extract configs diff --git a/hermes/modules/config.py b/hermes/modules/config.py index 3de0400..102087c 100644 --- a/hermes/modules/config.py +++ b/hermes/modules/config.py @@ -5,7 +5,7 @@ REQ_CV_HEADINGS = ("content_vector_data", "content_vector_transformations") CV_HEADINGS = () + REQ_CV_HEADINGS + ("content_vector_schemas",) -DATASETS_HEADINGS = ("vectorizer",) + UV_HEADINGS + CV_HEADINGS +DATASETS_HEADINGS = ("dataname",) + UV_HEADINGS + CV_HEADINGS HEADINGS = { "datasets": DATASETS_HEADINGS, \ "recommenders": ("recommenders"), \ diff --git a/hermes/modules/data.py b/hermes/modules/data.py index 5b74d3b..b2c0c91 100644 --- a/hermes/modules/data.py +++ b/hermes/modules/data.py @@ -5,10 +5,10 @@ class Data(object): """ Store configuration from configuration files. 
""" - def __init__(self, datapath, vector_transformation, schemapath, vectorizer): + def __init__(self, datapath, vector_transformation, schemapath, dataname): if helper.is_filepath_valid(datapath): self.datapath = datapath - self.vectorizer = vectorizer + self.dataname = dataname self.vector_transformation = vector_transformation self.schema = helper.get_schema(schemapath) self.dataframe = None @@ -26,13 +26,13 @@ def set_dataframe(self, scsingleton, datapath_in_hdfs): self.schema = self.dataframe.schema class UserVectorData(Data): - def __init__(self, datapath, vector_transformation, schemapath, vectorizer): - super(self.__class__, self).__init__(datapath, vector_transformation, schemapath, vectorizer) + def __init__(self, datapath, vector_transformation, schemapath, dataname): + super(self.__class__, self).__init__(datapath, vector_transformation, schemapath, dataname) self.which_vector = vectorgenerator.UserVector class ContentVectorData(Data): - def __init__(self, datapath, vector_transformation, schemapath, vectorizer): - super(self.__class__, self).__init__(datapath, vector_transformation, schemapath, vectorizer) + def __init__(self, datapath, vector_transformation, schemapath, dataname): + super(self.__class__, self).__init__(datapath, vector_transformation, schemapath, dataname) self.which_vector = vectorgenerator.ContentVector diff --git a/hermes/modules/vectorgenerator.py b/hermes/modules/vectorgenerator.py index 6e6a59d..40ff126 100644 --- a/hermes/modules/vectorgenerator.py +++ b/hermes/modules/vectorgenerator.py @@ -70,7 +70,7 @@ class ContentVector(Vector): class MovieLens(object): @classmethod def isSameDataInstance(cls, comparisonData): - return comparisonData.vectorizer == "movielens" + return comparisonData.dataname == "movielens" class MovieLensUserVector(UserVector, MovieLens): def ratings(self): @@ -84,7 +84,7 @@ def ratings_to_interact(self): class MovieLensContentVector(ContentVector, MovieLens): def genre(self): - def genre_vectorizer(row): + def genre_dataname(row): return np.array(( int(row.genre_action), int(row.genre_adventure), @@ -105,7 +105,7 @@ def genre_vectorizer(row): int(row.genre_war), int(row.genre_western), )) - return self.data.dataframe.map(lambda row: (row.movie_id, genre_vectorizer(row))) + return self.data.dataframe.map(lambda row: (row.movie_id, genre_dataname(row))) # ================================================================================ # Wiki @@ -115,7 +115,7 @@ def genre_vectorizer(row): class Wiki(object): @classmethod def isSameDataInstance(cls, comparisonData): - return comparisonData.vectorizer == "wiki" + return comparisonData.dataname == "wiki" class WikiUserVector(UserVector, Wiki): def __init__(self): From 55ebd993c7c097b2a477b371ba60349b589a98e8 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Tue, 5 Jan 2016 14:42:50 -0800 Subject: [PATCH 17/39] rename genre_dataname() to get_genre() for clarity --- hermes/modules/vectorgenerator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hermes/modules/vectorgenerator.py b/hermes/modules/vectorgenerator.py index 40ff126..ad1e7b7 100644 --- a/hermes/modules/vectorgenerator.py +++ b/hermes/modules/vectorgenerator.py @@ -84,7 +84,7 @@ def ratings_to_interact(self): class MovieLensContentVector(ContentVector, MovieLens): def genre(self): - def genre_dataname(row): + def get_genre(row): return np.array(( int(row.genre_action), int(row.genre_adventure), @@ -105,7 +105,7 @@ def genre_dataname(row): int(row.genre_war), int(row.genre_western), )) - return 
self.data.dataframe.map(lambda row: (row.movie_id, genre_dataname(row))) + return self.data.dataframe.map(lambda row: (row.movie_id, get_genre(row))) # ================================================================================ # Wiki From d9eb4e10e524b4b38721cf0fca1d08d5cca5a731 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Tue, 5 Jan 2016 15:01:35 -0800 Subject: [PATCH 18/39] add more glossary terms --- docs/data_supported.md | 2 + docs/framework.md | 103 +++++++++++++++++++++++++++++++++++++++-- docs/glossary.md | 2 + 3 files changed, 102 insertions(+), 5 deletions(-) diff --git a/docs/data_supported.md b/docs/data_supported.md index 74d630a..b0fb332 100644 --- a/docs/data_supported.md +++ b/docs/data_supported.md @@ -74,6 +74,8 @@ You can specify the vector transformation on a content vector by specifying cont ## Adding New Datasets +This excerpt is taken out from [Understanding Hermes's Framework](https://github.com/Lab41/hermes/tree/master/docs/framework.md#adding-new-datasets). + Currently, adding new dataset will require you to append the logic (see template below) in hermes/hermes/modules/vectorgenerator.py. To make it easier for the user, in the future, every time you add a new dataset, you will need to create a new file. The template for supporting an additional dataset is shown below. Template: diff --git a/docs/framework.md b/docs/framework.md index 528337e..08c7d94 100644 --- a/docs/framework.md +++ b/docs/framework.md @@ -75,11 +75,11 @@ Reading this entire article will give you the complete understanding of what the * [Handling Multiple Next States](#handling-multiple-next-states) * If you are planning to use your own dataset not yet supported by Hermes, please read: * [Understanding What Vectors Are](#understanding-what-vectors-are) - * [Datasets Supported](https://github.com/Lab41/hermes/tree/master/docs/data_supported.md), in particular Adding New Datasets section. + * [Datasets Supported](https://github.com/Lab41/hermes/tree/master/docs/data_supported.md), in particular [Adding New Dataset](#adding-new-dataset). * If you are planning to use your own recommender system algorithms not yet supported by Hermes, please read: - * [Recommender System Algorithms Supported](https://github.com/Lab41/hermes/tree/master/docs/recommenders_supported.md), in particular Adding New Recommender System Algorithm section. + * [Recommender System Algorithms Supported](https://github.com/Lab41/hermes/tree/master/docs/recommenders_supported.md), in particular [Adding New Recommender System Algorithms](#adding-new-recommender-system-algorithms). * If you are planning to use your own metrics not yet supported by Hermes, please read: - * [Metrics Supported](https://github.com/Lab41/hermes/tree/master/docs/metrics_supported.md), in particular Adding New Metric section. + * [Metrics Supported](https://github.com/Lab41/hermes/tree/master/docs/metrics_supported.md), in particular [Adding New Metric](#adding-new-metric). ## Main Components @@ -333,7 +333,7 @@ Every vector type inherits from the Vector class, meaning all User Vector and Co Since each data requires its own specific vector transformation, every data has its own class as well as its own UserVector and ContentVector. The data's UserVector and ContentVector inherit from both the data's own class as well as UserVector or ContentVector respectively. The data's UserVector and ContentVector have functions defined in their class to execute vector transformation. 
The name of these functions has to match the name of the vector transformation passed in via the configuration file in order for the vector transformation to occur. -Vectorizer is a variable used in configuration file to refer to the data where each JSON file is coming from. The data's own class has a check function called isSameDataInstance() to verify that the dataname passed in via the configuration file is describing about the same data as data's own class. +Dataname is a variable used in configuration file to refer to the data where each JSON file is coming from. The data's own class has a check function called isSameDataInstance() to verify that the dataname passed in via the configuration file describes the same data as the data's own class. To automatically create a vector (ie. which vector type and from which data), VectorFactory is there to the rescue! It can either return a Vector object or the RDD / vector itself by calling VectorFactory().create_obj_vector(...) or VectorFactory().create_vector(...) respectively. @@ -348,7 +348,96 @@ class MyNewVectorType(Vector): #### Adding New Dataset -Please read [Datasets Supported's section on Adding New Datasets](https://github.com/Lab41/hermes/tree/master/docs/data_supported.md#adding-new-datasets). +Same explanation can be found in [Datasets Supported's section on Adding New Datasets](https://github.com/Lab41/hermes/tree/master/docs/data_supported.md#adding-new-datasets). + +Currently, adding new dataset will require you to append the logic (see template below) in hermes/hermes/modules/vectorgenerator.py. To make it easier for the user, in the future, every time you add a new dataset, you will need to create a new file. The template for supporting an additional dataset is shown below. + +Template: + +```bash +class NewDataset(object): + @classmethod + def isSameDataInstance(cls, comparisonData): + return comparisonData.dataname == "new_dataset_dataname_name" + +class NewDatasetUserVector(UserVector, NewDataset): + def user_vector_transformation_1(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)) + + def user_vector_transformation_2(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)).filter(lambda (u, m, r): r > 3) + + def user_vector_transformation_n(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, -1 if row.rating < 3 else 1)) + +class NewDatasetContentVector(ContentVector, NewDataset): + def content_vector_transformation_1(self): + def internal_helper_function(row): + return np.array(( + int(row.genre_action), + int(row.genre_adventure), + int(row.genre_animation), + )) + return self.data.dataframe.map(lambda row: (row.movie_id, internal_helper_function(row))) + +``` + +1. Instantiate a class for your dataset. In this case, it is specified as class NewDataset. +2. Instantiate a User Vector and a Content Vector class for your dataset that inherit from your dataset class and UserVector or ContentVector, respectively. In this case, the UserVector for NewDataset is called NewDatasetUserVector, and the ContentVector for NewDataset is called NewDatasetContentVector. +3. Provide the dataname value for the check in isSameDataInstance(). In this case, dataname is checked against "new_dataset_dataname_name". +4. Provide the vector transformation logic for each type of vector. For User Vector transformations, define the function in the class NewDatasetUserVector.
In this case, these vector transformations are user_vector_transformation_1, user_vector_transformation_2, and user_vector_transformation_n. For Content Vector transformations, define the function in the class NewDatasetContentVector. In this case, the vector transformation is content_vector_transformation_1. +5. Additional support files needed for the vector transformation are passed down from the configuration file as self.support_files. self.support_files is a dictionary with the key as a variable and the value as the value received in the configuration file. Please read the [configuration file guide](https://github.com/Lab41/hermes/tree/master/docs/configs.md#optional-variables) for more details. + +After you have defined the concrete implementation of the new dataset, you can now use the dataset and apply multiple recommender system algorithms and metrics. + +In list_of_files.ini: +```bash +[new_dataset_dataname_name] +new_dataset_10m_ratings = /path/to/your/new/dataset/10m/ratings.json.gz +new_dataset_20m_ratings = /path/to/your/new/dataset/20m/ratings.json.gz +new_dataset_10m_ratings_schema = /path/to/your/new/dataset/10m/ratings_schema.json.gz +new_dataset_20m_ratings_schema = /path/to/your/new/dataset/20m/ratings_schema.json.gz + +new_dataset_10m_movies = /path/to/your/new/dataset/10m/movies.json.gz +new_dataset_10m_movies_schema = /path/to/your/new/dataset/10m/movies_schema.json.gz +``` + +In new_dataset_config.ini: +```bash +[datasets] +dataname = new_dataset_dataname_name + +# user vector +user_vector_data = ["new_dataset_10m_ratings", "new_dataset_20m_ratings"] +user_vector_schemas = ["new_dataset_10m_ratings_schema", "new_dataset_20m_ratings_schema"] +user_vector_transformations = ["user_vector_transformation_1", "user_vector_transformation_2"] + +# content vector +content_vector_data = ["new_dataset_10m_movies"] +content_vector_schema = ["new_dataset_10m_movies_schema"] +content_vector_transformations = ["content_vector_transformation_1"] + +[recommenders] +user_recommenders = ["ALS"] +content_recommenders = ["CBWithKMeans"] + +[metrics] +metrics = ["RMSE", "MAE"] +``` + +When you run hermes with the above configuration, the following will happen (a rough sketch of this fan-out follows the list): +* user_vector_transformation_1 will be applied to new_dataset_10m_ratings. +* user_vector_transformation_2 will be applied to new_dataset_20m_ratings. +* content_vector_transformation_1 will be applied to new_dataset_10m_movies. +* ALS will be applied to UserVector of new_dataset_10m_ratings. +* ALS will be applied to UserVector of new_dataset_20m_ratings. +* CBWithKMeans will be applied to ContentVector of new_dataset_10m_movies. +* RMSE will be applied to UserVector of new_dataset_10m_ratings after ALS has been applied to it. +* RMSE will be applied to UserVector of new_dataset_20m_ratings after ALS has been applied to it. +* RMSE will be applied to ContentVector of new_dataset_10m_movies after CBWithKMeans has been applied to it. +* MAE will be applied to UserVector of new_dataset_10m_ratings after ALS has been applied to it. +* MAE will be applied to UserVector of new_dataset_20m_ratings after ALS has been applied to it. +* MAE will be applied to ContentVector of new_dataset_10m_movies after CBWithKMeans has been applied to it.
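As a rough, hypothetical sketch of this fan-out: the helper functions `apply_transformation`, `run_recommender`, and `score` below are illustrative stand-ins, not hermes functions.

```python
# Illustrative sketch of how the configuration above fans out across
# transformations, recommenders, and metrics; the stubs are placeholders.
def apply_transformation(data, transformation):
    return (data, transformation)        # stands in for the generated vector

def run_recommender(recommender, vector):
    return (recommender, vector)         # stands in for the prediction vector

def score(metric, prediction):
    return (metric, prediction)          # stands in for the computed metric value

user_vector_data = ["new_dataset_10m_ratings", "new_dataset_20m_ratings"]
user_vector_transformations = ["user_vector_transformation_1", "user_vector_transformation_2"]
user_recommenders = ["ALS"]
metrics = ["RMSE", "MAE"]

# Transformations pair with datasets by position.
user_vectors = [apply_transformation(d, t)
                for d, t in zip(user_vector_data, user_vector_transformations)]

# Every user recommender runs on every user vector ...
predictions = [run_recommender(r, v) for v in user_vectors for r in user_recommenders]

# ... and every metric is computed on every resulting prediction.
results = [score(m, p) for p in predictions for m in metrics]
```

Content vectors follow the same pattern with content_vector_data, content_recommenders, and the same metrics list.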
#### Adding New Vector Transformation @@ -410,6 +499,8 @@ prediction_vector = recommender.make_prediction() #### Adding New Recommender System Algorithms +Same explanation can be found in [Recommenders Supported's section on Adding New Recommender System Algorithms](https://github.com/Lab41/hermes/tree/master/docs/recommenders_supported.md#adding-new-recommender-system-algorithms). + To add a new recommender system algorithm, instantiate a class that inherits from Recommender class and defines the make_prediction() function that calls on the recommender system algorithm's own make prediction function. ```bash @@ -509,6 +600,8 @@ MetricFactory() is a class that will automatically instantiate which metric depe #### Adding New Metric +Same explanation can be found in [Metrics Supported's section on Adding New Metric](https://github.com/Lab41/hermes/tree/master/docs/metrics_supported.md#adding-new-metric). + To add a new metric, create a class that inherits from the Metric class and define a calculate_metric function in the class. ```bash diff --git a/docs/glossary.md b/docs/glossary.md index f441251..8af7921 100644 --- a/docs/glossary.md +++ b/docs/glossary.md @@ -7,6 +7,8 @@ This is a glossary of common terms used in Hermes and their specified meaning. ## B ## C +**Cargo**: Cargo is the object passed around in the state machine. Since we can never know until runtime where each state has derived from and where it will go next, we do not know what parameters to pass into each state. Cargo encapsulates all the parameters needed for each state in one object. It is defined in cargo.py and instantiated in hermesctl's main(). + **Content Vector**: Content Vector refers to the vector describing the content in the data. ## D From 2cc02eecdb74eed1887869470dc07ecd34925b74 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Wed, 6 Jan 2016 15:34:21 -0800 Subject: [PATCH 19/39] separate vector generation of individual datasets in separate files --- docs/assumptions.md | 10 ++ docs/framework.md | 2 + hermes/hermes.py | 2 +- hermes/hermesctl.py | 2 +- hermes/modules/data.py | 2 +- hermes/modules/globals.py | 11 -- hermes/modules/helper.py | 71 +++++++++- hermes/modules/hermesglobals.py | 33 +++++ hermes/modules/metrics/__init__.py | 0 hermes/modules/recommendergenerator.py | 2 +- hermes/modules/recommenders/__init__.py | 0 hermes/modules/vectorgenerator.py | 121 ++++-------------- hermes/modules/vectors/__init__.py | 0 .../vectors/movielens_vectorgenerator.py | 46 +++++++ .../modules/vectors/wiki_vectorgenerator.py | 38 ++++++ 15 files changed, 226 insertions(+), 114 deletions(-) create mode 100644 docs/assumptions.md delete mode 100644 hermes/modules/globals.py create mode 100644 hermes/modules/hermesglobals.py create mode 100644 hermes/modules/metrics/__init__.py create mode 100644 hermes/modules/recommenders/__init__.py create mode 100644 hermes/modules/vectors/__init__.py create mode 100644 hermes/modules/vectors/movielens_vectorgenerator.py create mode 100644 hermes/modules/vectors/wiki_vectorgenerator.py diff --git a/docs/assumptions.md b/docs/assumptions.md new file mode 100644 index 0000000..cfb79cd --- /dev/null +++ b/docs/assumptions.md @@ -0,0 +1,10 @@ +# Assumptions + +* [Assumptions on Execution](#assumptions-on-execution) +* [Assumptions on Vector Creation](#assumptions-on-vector-creation) + +## Assumptions on Execution + + + +## Assumptions on Vector Creation \ No newline at end of file diff --git a/docs/framework.md b/docs/framework.md index 08c7d94..7aea9bf 100644 --- a/docs/framework.md +++ 
b/docs/framework.md @@ -69,6 +69,8 @@ Details of what each state does is explained in [hermes.py](#hermespy). Reading this entire article will give you the complete understanding of what the framework does. But if you wanted a TL;DR version, please check out the following: * If you do not know a particular term used in Hermes, please check out the glossary: * [Glossary](https://github.com/Lab41/hermes/tree/master/docs/glossary.md) +* Understand the assumptions made + * [Assumptions](https://github.com/Lab41/hermes/tree/master/docs/assumptions.md) * If you are planning to change the flow of the state machine, please read: * [Adding New States in State Machine](#adding-new-states-in-state-machine) * [Defining a New State](#defining-a-new-state) diff --git a/hermes/hermes.py b/hermes/hermes.py index 6db6120..5347e46 100644 --- a/hermes/hermes.py +++ b/hermes/hermes.py @@ -9,7 +9,7 @@ import modules.recommendergenerator as rg import modules.vectorgenerator as vg -from modules.globals import Globals +from modules.hermesglobals import Globals from modules.timer import Timer # TODO: empty certain items in cargo after no longer needed? diff --git a/hermes/hermesctl.py b/hermes/hermesctl.py index a9c3afd..ff221e2 100644 --- a/hermes/hermesctl.py +++ b/hermes/hermesctl.py @@ -13,7 +13,7 @@ from modules.cargo import Cargo from modules.data import UserVectorData, ContentVectorData -from modules.globals import Globals +from modules.hermesglobals import Globals from modules.singleton import SCSingleton from modules.statemachine import StateMachine diff --git a/hermes/modules/data.py b/hermes/modules/data.py index b2c0c91..47d42bd 100644 --- a/hermes/modules/data.py +++ b/hermes/modules/data.py @@ -1,4 +1,4 @@ -import helper +import helper import vectorgenerator # TODO: avoid this? # TODO: a better way of storing configuration from configuration file? diff --git a/hermes/modules/globals.py b/hermes/modules/globals.py deleted file mode 100644 index c05d95e..0000000 --- a/hermes/modules/globals.py +++ /dev/null @@ -1,11 +0,0 @@ -class Globals(object): - """Globals contains global variables shared by all files. - - Args: - verbose: a boolean variable that prints out debug log messages - logger: logging object that logs messages - scsingleton: Spark Context. There can only be one scsingleton running. 
- """ - verbose = False - logger = None - scsingleton = None diff --git a/hermes/modules/helper.py b/hermes/modules/helper.py index f8dcf67..6d9a10e 100644 --- a/hermes/modules/helper.py +++ b/hermes/modules/helper.py @@ -1,9 +1,16 @@ """Global helper functions""" -import os +import imp +import importlib +import inspect import json +import md5 +import os +import traceback + from pyspark.sql.types import StructType +from hermesglobals import Globals def is_filepath_valid(filepath): return True if os.path.isfile(filepath) else False @@ -13,3 +20,65 @@ def get_schema(schema_path): return None with open(schema_path, "r") as schema_file: return StructType.fromJson(json.load(schema_file)) + +def load_modules_in_dir(dir_path): + try: + try: + for root, dirs, files in os.walk(dir_path): + for filename in files: + if filename.endswith(".py"): + # current_file == module + thisfilepath = os.path.join(root, filename) + thisfile = open(thisfilepath, "rb") + # use md5.new to generate unique module identifier + # in case there are two modules of the same name + # assumption: no subdirectory within dir_path + module = imp.load_source(md5.new(thisfilepath).hexdigest(), thisfilepath, thisfile) + yield module + thisfile.close() + finally: + try: thisfile.close() + except: pass + except ImportError as err: + Globals.logger.error(err, exc_info=True) + raise + except Exception as err: + Globals.logger.error(err, exc_info=True) + raise + +# return generator of direct descendants +def get_direct_subclasses(module, cls): + try: + for name, obj in inspect.getmembers(module): + # 1. check that obj is a class + if inspect.isclass(obj): + # 2. check that obj is a direct descendant of class + if cls in obj.__bases__: + yield obj + else: + # WARNING: assumption that there is only one class of the same name in all of the modules + for objparent in obj.__bases__: + if objparent.__name__ == cls.__name__: + yield obj + except Exception as err: + Globals.logger.error(err, exc_info=True) + +# return generator of descendants including non-direct ones +def get_non_direct_subclasses(module, cls): + try: + for name, obj in inspect.getmembers(module): + # 1. check that obj is a class + if inspect.isclass(obj): + # 2. check that obj is a direct descendant of class + if issubclass(obj, cls): + yield obj + else: + # WARNING: assumption that there is only one class of the same name in all of the modules + for objparent in obj.__bases__: + if objparent.__name__ == cls.__name__: + yield obj + except Exception as err: + Globals.logger.error(err, exc_info=True) + + + diff --git a/hermes/modules/hermesglobals.py b/hermes/modules/hermesglobals.py new file mode 100644 index 0000000..c6029b4 --- /dev/null +++ b/hermes/modules/hermesglobals.py @@ -0,0 +1,33 @@ +import os + + + + +class Globals(object): + """Globals contains global variables shared by all files. + + Args: + verbose: a boolean variable that prints out debug log messages + logger: logging object that logs messages + scsingleton: Spark Context. There can only be one scsingleton running. 
+ DIR_VECTORS_PATH: a constant string that refers to the directory where vectorgenerators for specific datasets are resided + DIR_RECOMMENDERS_PATH: a constant string that refers to the directory where recommendergenerators for specific recommenders are resided + DIR_METRICS_PATH: a constant string that refers to the directory where metricgenerators for specific metrics are resided + """ + + class Constants(object): + def __init__(self): + self.DIR_VECTORS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "vectors" + self.DIR_RECOMMENDERS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "recommenders" + self.DIR_METRICS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "metrics" + + def __setattr__(self, attr, value): + if hasattr(self, attr): + print("ERROR: cannot reset a constant variable %s = %s" % (attr, value)) + else: + self.__dict__[attr] = value + + verbose = False + logger = None + scsingleton = None + constants = Constants() diff --git a/hermes/modules/metrics/__init__.py b/hermes/modules/metrics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hermes/modules/recommendergenerator.py b/hermes/modules/recommendergenerator.py index ab73681..9dcce3f 100644 --- a/hermes/modules/recommendergenerator.py +++ b/hermes/modules/recommendergenerator.py @@ -21,7 +21,7 @@ import timer import pyspark.mllib.recommendation as mllib -from modules.globals import Globals +from hermesglobals import Globals # ================================================================================ # Background implementation interface diff --git a/hermes/modules/recommenders/__init__.py b/hermes/modules/recommenders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hermes/modules/vectorgenerator.py b/hermes/modules/vectorgenerator.py index ad1e7b7..99387f9 100644 --- a/hermes/modules/vectorgenerator.py +++ b/hermes/modules/vectorgenerator.py @@ -1,29 +1,37 @@ # vector generator == rdd generator +import helper +from hermesglobals import Globals + # ================================================================================ # Vector Factory # ================================================================================ class VectorFactory(object): - def create_vector(self, sqlCtx, data, support_files): vector = data.which_vector - for cls in vector.__subclasses__(): - if cls.isSameDataInstance(data): - return cls(sqlCtx, data, support_files).vector - else: - # cannot find class that builds the data - raise ValueError + # get subclasses that inherit from either UserVector or ContentVector + # from modules in hermes/hermes/modules/vectors directory + for module in helper.load_modules_in_dir(Globals.constants.DIR_VECTORS_PATH): + for subclass in helper.get_direct_subclasses(module, vector): + if subclass.isSameDataInstance(data): + return subclass(sqlCtx, data, support_files).vector + else: + # cannot find class that builds the data + raise ValueError def create_obj_vector(self, sqlCtx, data, support_files): vector = data.which_vector - for cls in vector.__subclasses__(): - if cls.isSameDataInstance(data): - return cls(sqlCtx, data, support_files) - else: - # cannot find class that builds the data - raise ValueError + # get subclasses that inherit from either UserVector or ContentVector + # from modules in hermes/hermes/modules/vectors directory + for module in helper.load_modules_in_dir(Globals.constants.DIR_VECTORS_PATH): + for subclass in helper.get_direct_subclasses(module, vector): + if subclass.isSameDataInstance(data): + return 
subclass(sqlCtx, data, support_files) + else: + # cannot find class that builds the data + raise ValueError # ================================================================================ # Vector Factory Objects @@ -62,90 +70,7 @@ class ContentVector(Vector): pass # ================================================================================ -# MovieLens -# ================================================================================ - -# TODO: separate in its own file -# TODO: do we need isSameDataInstance()? can we eliminate it? -class MovieLens(object): - @classmethod - def isSameDataInstance(cls, comparisonData): - return comparisonData.dataname == "movielens" - -class MovieLensUserVector(UserVector, MovieLens): - def ratings(self): - return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)) - - def pos_ratings(self): - return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)).filter(lambda (u, m, r): r > 3) - - def ratings_to_interact(self): - return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, -1 if row.rating < 3 else 1)) - -class MovieLensContentVector(ContentVector, MovieLens): - def genre(self): - def get_genre(row): - return np.array(( - int(row.genre_action), - int(row.genre_adventure), - int(row.genre_animation), - int(row.genre_childrens), - int(row.genre_comedy), - int(row.genre_crime), - int(row.genre_documentary), - int(row.genre_drama), - int(row.genre_fantasy), - int(row.genre_filmnoir), - int(row.genre_horror), - int(row.genre_musical), - int(row.genre_mystery), - int(row.genre_romance), - int(row.genre_scifi), - int(row.genre_thriller), - int(row.genre_war), - int(row.genre_western), - )) - return self.data.dataframe.map(lambda row: (row.movie_id, get_genre(row))) - -# ================================================================================ -# Wiki -# ================================================================================ - -# TODO: separate in its own file -class Wiki(object): - @classmethod - def isSameDataInstance(cls, comparisonData): - return comparisonData.dataname == "wiki" - -class WikiUserVector(UserVector, Wiki): - def __init__(self): - super(self.__class__, self).__init__() - self.filtered = self.sqlCtx.sql("select * from ratings where redirect_target is null and article_namespace=0 and user_id is not null") - self.filtered.registerTempTable("wiki_ratings") - - def num_edits(self): - return self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings group by user_id, article_id") - - def any_interact(self): - return self.sqlCtx.sql("select user_id as user, article_id as item, 1 as rating from wiki_ratings group by user_id, article_id") - - def num_edits_ceil(self): - return self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings group by user_id, article_id")\ - .map(lambda (user, article, rating): (user, article, max(rating, 5))) - -class WikiContentVector(ContentVector, Wiki): - def __init__(self): - super(self.__class__, self).__init__() - self.filtered_content = sqlCtx.sqlCtx.sql("select * from content where redirect_target is null and article_namespace=0 and full_text is not null") - self.filtered_content.registerTempTable("wiki_content") - - def glove(self): - raise NotImplemented - - def category_map(self): - raise NotImplemented - -# ================================================================================ -# ADD ADDITIONAL UserVector and ContentVector based on a 
given data +# User Vector and Content Vector for specific datasetes +# defined in hermes/hermes/modules/vectors # ================================================================================ diff --git a/hermes/modules/vectors/__init__.py b/hermes/modules/vectors/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hermes/modules/vectors/movielens_vectorgenerator.py b/hermes/modules/vectors/movielens_vectorgenerator.py new file mode 100644 index 0000000..429b4cc --- /dev/null +++ b/hermes/modules/vectors/movielens_vectorgenerator.py @@ -0,0 +1,46 @@ +from hermes.modules.vectorgenerator import UserVector, ContentVector + +# ================================================================================ +# MovieLens +# ================================================================================ + +# TODO: do we need isSameDataInstance()? can we eliminate it? +class MovieLens(object): + @classmethod + def isSameDataInstance(cls, comparisonData): + return comparisonData.dataname == "movielens" + +class MovieLensUserVector(UserVector, MovieLens): + def ratings(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)) + + def pos_ratings(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)).filter(lambda (u, m, r): r > 3) + + def ratings_to_interact(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, -1 if row.rating < 3 else 1)) + +class MovieLensContentVector(ContentVector, MovieLens): + def genre(self): + def get_genre(row): + return np.array(( + int(row.genre_action), + int(row.genre_adventure), + int(row.genre_animation), + int(row.genre_childrens), + int(row.genre_comedy), + int(row.genre_crime), + int(row.genre_documentary), + int(row.genre_drama), + int(row.genre_fantasy), + int(row.genre_filmnoir), + int(row.genre_horror), + int(row.genre_musical), + int(row.genre_mystery), + int(row.genre_romance), + int(row.genre_scifi), + int(row.genre_thriller), + int(row.genre_war), + int(row.genre_western), + )) + return self.data.dataframe.map(lambda row: (row.movie_id, get_genre(row))) diff --git a/hermes/modules/vectors/wiki_vectorgenerator.py b/hermes/modules/vectors/wiki_vectorgenerator.py new file mode 100644 index 0000000..e5c92a0 --- /dev/null +++ b/hermes/modules/vectors/wiki_vectorgenerator.py @@ -0,0 +1,38 @@ +from hermes.modules.vectorgenerator import UserVector, ContentVector + +# ================================================================================ +# Wiki +# ================================================================================ + +class Wiki(object): + @classmethod + def isSameDataInstance(cls, comparisonData): + return comparisonData.dataname == "wiki" + +class WikiUserVector(UserVector, Wiki): + def __init__(self): + super(self.__class__, self).__init__() + self.filtered = self.sqlCtx.sql("select * from ratings where redirect_target is null and article_namespace=0 and user_id is not null") + self.filtered.registerTempTable("wiki_ratings") + + def num_edits(self): + return self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings group by user_id, article_id") + + def any_interact(self): + return self.sqlCtx.sql("select user_id as user, article_id as item, 1 as rating from wiki_ratings group by user_id, article_id") + + def num_edits_ceil(self): + return self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings group by user_id, article_id")\ + .map(lambda 
(user, article, rating): (user, article, max(rating, 5))) + +class WikiContentVector(ContentVector, Wiki): + def __init__(self): + super(self.__class__, self).__init__() + self.filtered_content = sqlCtx.sqlCtx.sql("select * from content where redirect_target is null and article_namespace=0 and full_text is not null") + self.filtered_content.registerTempTable("wiki_content") + + def glove(self): + raise NotImplemented + + def category_map(self): + raise NotImplemented \ No newline at end of file From 3df20a8a5a3b03eed438c46ed9742ced7c8fe12a Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Wed, 6 Jan 2016 17:00:26 -0800 Subject: [PATCH 20/39] document assumptions --- docs/assumptions.md | 59 +++++++++++++++++++++++++++++++++++++++++- docs/data_supported.md | 4 ++- docs/framework.md | 4 ++- 3 files changed, 64 insertions(+), 3 deletions(-) diff --git a/docs/assumptions.md b/docs/assumptions.md index cfb79cd..e7df056 100644 --- a/docs/assumptions.md +++ b/docs/assumptions.md @@ -5,6 +5,63 @@ ## Assumptions on Execution +Here is an example file called config.ini. + +```bash +[datasets] +dataname = movielens + +# user vector +user_vector_data = ["movielens_10m_ratings", "movielens_20m_ratings"] +user_vector_schemas = ["movielens_10m_ratings_schema", "movielens_20m_ratings_schema"] +user_vector_transformations = ["ratings", "ratings_to_interact"] + +# content vector +content_vector_data = ["movielens_10m_movies"] +content_vector_schema = ["movielens_10m_movies_schema"] +content_vector_transformations = ["genre"] + +[recommenders] +user_recommenders = ["ALS"] +content_recommenders = ["CBWithKMeans"] + +[metrics] +metrics = ["RMSE", "MAE"] +``` + +When you specify this configuration, the assumptions that we make during execution are as follows: +* each transformation is applied in sequential order to the data, meaning + * user_vector_transformation "ratings" is applied to "movielens_10m_ratings" and "movielens_10m_ratings_schema" + * user_vector_transformation "ratings_to_interact" is applied to "movielens_20m_ratings" and "movielens_20m_ratings_schema" + * content_vector_transformation "genre" is applied to "movielens_10m_movies" and "movielens_10m_movies_schema" +* user_recommenders take in a list of recommender algorithms that will be applied to all user_vector_data, meaning + * apply ALS to a User Vector of movielens_10m_ratings that has been transformed by vector transformation "ratings" + * apply ALS to a User Vector of movielens_20m_ratings that has been transformed by vector transformation "ratings_to_interact" +* content_recommenders take in a list of recommender algorithms that will be applied to all content_vector_data, meaning + * apply CBWithKMeans to a Content Vector of movielens_10m_movies that has been transformed by vector transformation "genre" +* metrics take in a list of metrics that will be applied to all data, including both user_vector_data and content_vector_data, after recommender algorithms have been applied to them, meaning + * apply RMSE to a User Vector of movielens_10m_ratings that has been transformed by vector transformation "ratings" and recommendation system algorithm ALS + * apply RMSE to a User Vector of movielens_20m_ratings that has been transformed by vector transformation "ratings_to_interact" and recommendation system algorithm ALS + * apply RMSE to a Content Vector of movielens_10m_movies that has been transformed by vector transformation "genre" and recommendation system algorithm CBWithKMeans + * apply MAE to a User Vector of
movielens_10m_ratings that has been transformed by vector transformation "ratings" and recommendation system algorithm ALS + * apply MAE to a User Vector of movielens_20m_ratings that has been transformed by vector transformation "ratings_to_interact" and recommendation system algorithm ALS + * apply MAE to a Content Vector of movielens_10m_movies that has been transformed by vector transformation "genre" and recommendation system algorithm CBWithKMeans + +## Assumptions on Vector Creation + +Each dataset is unique in that transforming JSON to RDD is different for each dataset. This step is implemented in vectorgenerator.py. When we separate the implementation of vector generation of each dataset into individual files in the hermes/hermes/modules/vectors directory, each of these files needs to import vectorgenerator.py in this specific manner: + +```bash +from hermes.modules.vectorgenerator import UserVector, ContentVector +``` + +The reason for this lies in how the vector object is instantiated in the VectorFactory class. When we specify which vector to create, it is either a UserVector or a ContentVector class, both of which are defined in vectorgenerator.py, and vectorgenerator.py as a module is hermes.modules.vectorgenerator. + +Because the children are now defined in separate modules in the hermes/hermes/modules/vectors directory, we can no longer use the __subclasses__() function to iterate through all children of the UserVector class or the ContentVector class in order to instantiate the right vector. Instead, we have to load all the modules and go through each class in each module to find all children of the UserVector or ContentVector class. Unfortunately, if you define the import statement as "from modules.vectorgenerator" instead of "from hermes.modules.vectorgenerator", Python does not think the two modules are the same even though they are. + +We have yet to determine why this is the case. + +When users add a new dataset, we cannot always assume that they will import exactly as "from hermes.modules.vectorgenerator import UserVector, ContentVector" because they can import it as "from modules.vectorgenerator import UserVector, ContentVector", which is also valid. For this reason, we have made the assumption that if the parent class of MovieLensUserVector, for example, has the __name__ UserVector, then MovieLensUserVector is a child of UserVector. The problem with this assumption is that if MovieLensUserVector inherits from multiple parents in different modules that share the same class name, both parents will be treated as the same class. + -## Assumptions on Vector Creation \ No newline at end of file diff --git a/docs/data_supported.md b/docs/data_supported.md index b0fb332..295a49f 100644 --- a/docs/data_supported.md +++ b/docs/data_supported.md @@ -76,11 +76,13 @@ You can specify the vector transformation on a content vector by specifying cont This excerpt is taken out from [Understanding Hermes's Framework](https://github.com/Lab41/hermes/tree/master/docs/framework.md#adding-new-datasets). -Currently, adding new dataset will require you to append the logic (see template below) in hermes/hermes/modules/vectorgenerator.py. To make it easier for the user, in the future, every time you add a new dataset, you will need to create a new file. The template for supporting an additional dataset is shown below. +Every time you add a new dataset, you will need to create a new file in hermes/hermes/modules/vectors.
The template for supporting an additional dataset is shown below. Template: ```bash +from hermes.modules.vectorgenerator import UserVector, ContentVector + class NewDataset(object): @classmethod def isSameDataInstance(cls, comparisonData): diff --git a/docs/framework.md index 7aea9bf..ee62f36 100644 --- a/docs/framework.md +++ b/docs/framework.md @@ -352,11 +352,13 @@ class MyNewVectorType(Vector): Same explanation can be found in [Datasets Supported's section on Adding New Datasets](https://github.com/Lab41/hermes/tree/master/docs/data_supported.md#adding-new-datasets). -Currently, adding new dataset will require you to append the logic (see template below) in hermes/hermes/modules/vectorgenerator.py. To make it easier for the user, in the future, every time you add a new dataset, you will need to create a new file. The template for supporting an additional dataset is shown below. +Every time you add a new dataset, you will need to create a new file in hermes/hermes/modules/vectors. The template for supporting an additional dataset is shown below. Template: ```bash +from hermes.modules.vectorgenerator import UserVector, ContentVector + class NewDataset(object): @classmethod def isSameDataInstance(cls, comparisonData): From 6257af94903174284771ca4614b6d30c239ba0ec Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Thu, 7 Jan 2016 13:31:02 -0800 Subject: [PATCH 21/39] separate recommender generation of different use cases in separate files --- hermes/modules/hermesglobals.py | 6 +-- hermes/modules/{metrics => mg}/__init__.py | 0 hermes/modules/recommendergenerator.py | 48 ++----------------- .../modules/{recommenders => rg}/__init__.py | 0 hermes/modules/rg/default_rg.py | 14 ++++++ hermes/modules/rg/interface.py | 10 ++++ hermes/modules/rg/with_tfidf_rg.py | 8 ++++ hermes/modules/rg/without_tfidf_rg.py | 8 ++++ hermes/modules/{vectors => vg}/__init__.py | 0 .../movielens_vg.py} | 0 .../wiki_vectorgenerator.py => vg/wiki_vg.py} | 0 11 files changed, 48 insertions(+), 46 deletions(-) rename hermes/modules/{metrics => mg}/__init__.py (100%) rename hermes/modules/{recommenders => rg}/__init__.py (100%) create mode 100644 hermes/modules/rg/default_rg.py create mode 100644 hermes/modules/rg/interface.py create mode 100644 hermes/modules/rg/with_tfidf_rg.py create mode 100644 hermes/modules/rg/without_tfidf_rg.py rename hermes/modules/{vectors => vg}/__init__.py (100%) rename hermes/modules/{vectors/movielens_vectorgenerator.py => vg/movielens_vg.py} (100%) rename hermes/modules/{vectors/wiki_vectorgenerator.py => vg/wiki_vg.py} (100%) diff --git a/hermes/modules/hermesglobals.py b/hermes/modules/hermesglobals.py index c6029b4..56964c7 100644 --- a/hermes/modules/hermesglobals.py +++ b/hermes/modules/hermesglobals.py @@ -17,9 +17,9 @@ class Globals(object): class Constants(object): def __init__(self): - self.DIR_VECTORS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "vectors" - self.DIR_RECOMMENDERS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "recommenders" - self.DIR_METRICS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "metrics" + self.DIR_VECTORS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "vg" + self.DIR_RECOMMENDERS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "rg" + self.DIR_METRICS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "mg" def __setattr__(self, attr, value): if hasattr(self, attr): diff --git
a/hermes/modules/metrics/__init__.py b/hermes/modules/mg/__init__.py similarity index 100% rename from hermes/modules/metrics/__init__.py rename to hermes/modules/mg/__init__.py diff --git a/hermes/modules/recommendergenerator.py b/hermes/modules/recommendergenerator.py index 9dcce3f..bf75e2d 100644 --- a/hermes/modules/recommendergenerator.py +++ b/hermes/modules/recommendergenerator.py @@ -19,56 +19,18 @@ import sys import timer -import pyspark.mllib.recommendation as mllib -from hermesglobals import Globals - -# ================================================================================ -# Background implementation interface -# ================================================================================ - -class ImplementationInterface(object): - def make_prediction_with_als(self): - raise NotImplemented - - def make_prediction_with_cbkmeans(self): - raise NotImplemented +import helper - -# ================================================================================ -# Concrete background implementations -# ================================================================================ - -# TODO: ask Anna for the specifics -class WithTfidf(ImplementationInterface): - """ - # TODO - def make_prediction_with_cbkmeans(self, vector): - # create CB with K-means with tf-idf - raise NotImplemented - """ - -class WithoutTfidf(ImplementationInterface): - def make_prediction_with_als(self, vector): - # create ALS model without tf-idf - # TODO: specify rank based on what the user wants - model = mllib.ALS.train(vector.training_vector, rank=3) - prediction_vector = model.predictAll( vector.test_vector.map( lambda x: (x[0], x[1]) ) ).cache() - return prediction_vector - - """ - # TODO - def make_prediction_with_cbkmeans(self, vector): - # create CB with K-means without tf-idf - raise NotImplemented - """ +from hermesglobals import Globals +from rg.default_rg import Default # ================================================================================ # Bridge: bridge target interface & background implementation # ================================================================================ class Recommender(object): - def __init__(self, vector, implementation=WithoutTfidf()): + def __init__(self, vector, implementation=Default()): self.vector = vector self.implementation = implementation @@ -81,7 +43,7 @@ def make_prediction(self): # ================================================================================ class RecommenderFactory(object): - def create_obj_recommender(self, recommender_str, vector, implementation=WithoutTfidf()): + def create_obj_recommender(self, recommender_str, vector, implementation=Default()): which_recommender = getattr(sys.modules[__name__], recommender_str) if not which_recommender: # cannot find class diff --git a/hermes/modules/recommenders/__init__.py b/hermes/modules/rg/__init__.py similarity index 100% rename from hermes/modules/recommenders/__init__.py rename to hermes/modules/rg/__init__.py diff --git a/hermes/modules/rg/default_rg.py b/hermes/modules/rg/default_rg.py new file mode 100644 index 0000000..72efa1c --- /dev/null +++ b/hermes/modules/rg/default_rg.py @@ -0,0 +1,14 @@ +from interface import ImplementationInterface + +import pyspark.mllib.recommendation as mllib + +# ================================================================================ +# Concrete background implementations +# ================================================================================ + +class Default(ImplementationInterface): + def 
make_prediction_with_als(self, vector): + # TODO: specify rank based on what the user wants + model = mllib.ALS.train(vector.training_vector, rank=3) + prediction_vector = model.predictAll( vector.test_vector.map( lambda x: (x[0], x[1]) ) ).cache() + return prediction_vector \ No newline at end of file diff --git a/hermes/modules/rg/interface.py b/hermes/modules/rg/interface.py new file mode 100644 index 0000000..518399d --- /dev/null +++ b/hermes/modules/rg/interface.py @@ -0,0 +1,10 @@ +# ================================================================================ +# Background implementation interface +# ================================================================================ + +class ImplementationInterface(object): + def make_prediction_with_als(self): + raise NotImplemented + + def make_prediction_with_cbkmeans(self): + raise NotImplemented \ No newline at end of file diff --git a/hermes/modules/rg/with_tfidf_rg.py b/hermes/modules/rg/with_tfidf_rg.py new file mode 100644 index 0000000..3c1e251 --- /dev/null +++ b/hermes/modules/rg/with_tfidf_rg.py @@ -0,0 +1,8 @@ +from interface import ImplementationInterface + +# ================================================================================ +# Concrete background implementations +# ================================================================================ + +class WithTfidf(ImplementationInterface): + pass \ No newline at end of file diff --git a/hermes/modules/rg/without_tfidf_rg.py b/hermes/modules/rg/without_tfidf_rg.py new file mode 100644 index 0000000..ab4f23d --- /dev/null +++ b/hermes/modules/rg/without_tfidf_rg.py @@ -0,0 +1,8 @@ +from interface import ImplementationInterface + +# ================================================================================ +# Concrete background implementations +# ================================================================================ + +class WithoutTfidf(ImplementationInterface): + pass \ No newline at end of file diff --git a/hermes/modules/vectors/__init__.py b/hermes/modules/vg/__init__.py similarity index 100% rename from hermes/modules/vectors/__init__.py rename to hermes/modules/vg/__init__.py diff --git a/hermes/modules/vectors/movielens_vectorgenerator.py b/hermes/modules/vg/movielens_vg.py similarity index 100% rename from hermes/modules/vectors/movielens_vectorgenerator.py rename to hermes/modules/vg/movielens_vg.py diff --git a/hermes/modules/vectors/wiki_vectorgenerator.py b/hermes/modules/vg/wiki_vg.py similarity index 100% rename from hermes/modules/vectors/wiki_vectorgenerator.py rename to hermes/modules/vg/wiki_vg.py From 67e8cd1f3772cc341bd2fe4de54faa1fde92669f Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Thu, 7 Jan 2016 13:55:45 -0800 Subject: [PATCH 22/39] rename files to provide more clarity --- hermes/modules/recommendergenerator.py | 2 +- hermes/modules/rg/{default_rg.py => default_usecase.py} | 2 +- hermes/modules/rg/{with_tfidf_rg.py => with_tfidf_usecase.py} | 2 +- .../rg/{without_tfidf_rg.py => without_tfidf_usecase.py} | 2 +- .../vg/{movielens_vg.py => movielens_vectorgenerator.py} | 0 hermes/modules/vg/{wiki_vg.py => wiki_vectorgenerator.py} | 0 6 files changed, 4 insertions(+), 4 deletions(-) rename hermes/modules/rg/{default_rg.py => default_usecase.py} (91%) rename hermes/modules/rg/{with_tfidf_rg.py => with_tfidf_usecase.py} (80%) rename hermes/modules/rg/{without_tfidf_rg.py => without_tfidf_usecase.py} (79%) rename hermes/modules/vg/{movielens_vg.py => movielens_vectorgenerator.py} (100%) rename 
hermes/modules/vg/{wiki_vg.py => wiki_vectorgenerator.py} (100%) diff --git a/hermes/modules/recommendergenerator.py b/hermes/modules/recommendergenerator.py index bf75e2d..fed8697 100644 --- a/hermes/modules/recommendergenerator.py +++ b/hermes/modules/recommendergenerator.py @@ -23,7 +23,7 @@ import helper from hermesglobals import Globals -from rg.default_rg import Default +from rg.default_usecase import Default # ================================================================================ # Bridge: bridge target interface & background implementation diff --git a/hermes/modules/rg/default_rg.py b/hermes/modules/rg/default_usecase.py similarity index 91% rename from hermes/modules/rg/default_rg.py rename to hermes/modules/rg/default_usecase.py index 72efa1c..f63dca1 100644 --- a/hermes/modules/rg/default_rg.py +++ b/hermes/modules/rg/default_usecase.py @@ -3,7 +3,7 @@ import pyspark.mllib.recommendation as mllib # ================================================================================ -# Concrete background implementations +# Concrete background implementations for default use cases # ================================================================================ class Default(ImplementationInterface): diff --git a/hermes/modules/rg/with_tfidf_rg.py b/hermes/modules/rg/with_tfidf_usecase.py similarity index 80% rename from hermes/modules/rg/with_tfidf_rg.py rename to hermes/modules/rg/with_tfidf_usecase.py index 3c1e251..d051958 100644 --- a/hermes/modules/rg/with_tfidf_rg.py +++ b/hermes/modules/rg/with_tfidf_usecase.py @@ -1,7 +1,7 @@ from interface import ImplementationInterface # ================================================================================ -# Concrete background implementations +# Concrete background implementations for use cases with tf-idf # ================================================================================ class WithTfidf(ImplementationInterface): diff --git a/hermes/modules/rg/without_tfidf_rg.py b/hermes/modules/rg/without_tfidf_usecase.py similarity index 79% rename from hermes/modules/rg/without_tfidf_rg.py rename to hermes/modules/rg/without_tfidf_usecase.py index ab4f23d..2b3ebb1 100644 --- a/hermes/modules/rg/without_tfidf_rg.py +++ b/hermes/modules/rg/without_tfidf_usecase.py @@ -1,7 +1,7 @@ from interface import ImplementationInterface # ================================================================================ -# Concrete background implementations +# Concrete background implementations for use cases without tf-idf # ================================================================================ class WithoutTfidf(ImplementationInterface): diff --git a/hermes/modules/vg/movielens_vg.py b/hermes/modules/vg/movielens_vectorgenerator.py similarity index 100% rename from hermes/modules/vg/movielens_vg.py rename to hermes/modules/vg/movielens_vectorgenerator.py diff --git a/hermes/modules/vg/wiki_vg.py b/hermes/modules/vg/wiki_vectorgenerator.py similarity index 100% rename from hermes/modules/vg/wiki_vg.py rename to hermes/modules/vg/wiki_vectorgenerator.py From 19e3c542e88ba8ec61f033f3fa00b734e87e56d7 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Thu, 7 Jan 2016 15:00:52 -0800 Subject: [PATCH 23/39] config differentiates recommenders based on the vector types --- hermes/configs/movielens_config.ini | 3 +-- hermes/hermes.py | 28 ++++++++++++++++++++++------ hermes/hermesctl.py | 9 ++++++--- hermes/modules/cargo.py | 4 +++- hermes/modules/config.py | 19 ++++++++++++++----- hermes/modules/helper.py | 21 
+++++++++++++++++++++ 6 files changed, 67 insertions(+), 17 deletions(-) diff --git a/hermes/configs/movielens_config.ini b/hermes/configs/movielens_config.ini index 8b83aae..c55ea3b 100644 --- a/hermes/configs/movielens_config.ini +++ b/hermes/configs/movielens_config.ini @@ -8,8 +8,7 @@ user_vector_transformations = ["ratings"] #content_vector_transformations = ["genre"] [recommenders] -recommenders = ["ALS"] -#user_recommenders = ["ALS"] +user_recommenders = ["ALS"] #content_recommenders = [""] diff --git a/hermes/hermes.py b/hermes/hermes.py index 5347e46..f8f1dfd 100644 --- a/hermes/hermes.py +++ b/hermes/hermes.py @@ -5,6 +5,7 @@ import os import hermesui +import modules.helper as helper import modules.metricgenerator as mg import modules.recommendergenerator as rg import modules.vectorgenerator as vg @@ -95,16 +96,31 @@ def make_prediction_state(cargo): if Globals.verbose: Globals.logger.debug("In make_prediction_state:") for i in range(0, len(cargo.vectors)): - for r in cargo.recommenders: - if Globals.verbose: Globals.logger.debug("Making recommendation %s on data %s", r, cargo.vectors[i].data.datapath) - # TODO: implement other implementations, ie. WithTfidf(), etc. - # default is WithoutTfidf() - recommender = rg.RecommenderFactory().create_obj_recommender(r, cargo.vectors[i]) + thisvector = cargo.vectors[i] + + # select which recommenders based on the vector type + recommenders = None + if helper.is_direct_subclass(thisvector, vg.UserVector): + if Globals.verbose: Globals.logger.debug("Iterating through recommenders for user vector on data %s", thisvector.data.datapath) + recommenders = cargo.user_recommenders + elif helper.is_direct_subclass(thisvector, vg.ContentVector): + if Globals.verbose: Globals.logger.debug("Iterating through recommenders for content vector on data %s", thisvector.data.datapath) + recommenders = cargo.content_recommenders + + # run all recommenders on the vector + for r in recommenders: + if Globals.verbose: Globals.logger.debug("Making recommendation %s on data %s", r, thisvector.data.datapath) + # TODO: implement other use case, ie. WithTfidf(), etc. + recommender = rg.RecommenderFactory().create_obj_recommender(r, thisvector) + # default use case + # recommender = RecommenderFactory().create_obj_recommender(r, vector, Default()) + # with tf-idf use case # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithTfidf()) + # without tf-idf use case # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithoutTfidf()) # etc. 
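The commented lines above sketch how the bridge is meant to be exercised once the other background implementations are filled in. A minimal, hypothetical sketch of that call pattern (import paths follow hermes.py and may need adjusting; "ALS" and thisvector come from the config and the loop above):

    import modules.recommendergenerator as rg
    from modules.rg.default_usecase import Default
    from modules.rg.with_tfidf_usecase import WithTfidf

    # default use case: MLlib ALS trained on the vector's training split
    recommender = rg.RecommenderFactory().create_obj_recommender("ALS", thisvector, Default())
    prediction_vector = recommender.make_prediction()

    # same recommender string, different background implementation, once WithTfidf
    # actually overrides make_prediction_with_als instead of passing
    recommender = rg.RecommenderFactory().create_obj_recommender("ALS", thisvector, WithTfidf())
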
with Timer() as t: - cargo.vectors[i].prediction_vector = recommender.make_prediction() + thisvector.prediction_vector = recommender.make_prediction() if Globals.verbose: Globals.logger.debug("Making prediction takes %s seconds" % t.secs) newState = calculate_metrics_state diff --git a/hermes/hermesctl.py b/hermes/hermesctl.py index ff221e2..50bbc3e 100644 --- a/hermes/hermesctl.py +++ b/hermes/hermesctl.py @@ -107,9 +107,12 @@ def handle_recognized_section_item(section, item_key, item_value): # [datasets] items will be placed into cargo in handle_dataset_section() return if section == "recommenders": - if item_key == "recommenders": - # add list of recommenders into cargo - cargo.recommenders.extend( json.loads(item_value) ) + if item_key == "user_recommenders": + # add list of recommenders for user vectors into cargo + cargo.user_recommenders.extend( json.loads(item_value) ) + elif item_key == "content_recommenders": + # add list of recommenders for content vectors into cargo + cargo.content_recommenders.extend( json.loads(item_value) ) return if section == "metrics": if item_key == "metrics": diff --git a/hermes/modules/cargo.py b/hermes/modules/cargo.py index 86c9af1..adc6330 100644 --- a/hermes/modules/cargo.py +++ b/hermes/modules/cargo.py @@ -22,7 +22,9 @@ def __init__(self): self.datas = [] # used until json_to_rdd_state self.vectors = [] # used until develop_model_state self.support_files = {} - self.recommenders = [] + # TODO: clean up so that there is only recommenders...and not user_recommenders & content_recommenders + self.user_recommenders = [] + self.content_recommenders = [] self.metrics = [] self.error_msg = "" diff --git a/hermes/modules/config.py b/hermes/modules/config.py index 102087c..eeaf6c5 100644 --- a/hermes/modules/config.py +++ b/hermes/modules/config.py @@ -1,16 +1,25 @@ # recognized sections and their items +# 1. datasets section REQ_UV_HEADINGS = ("user_vector_data", "user_vector_transformations") -UV_HEADINGS = () + REQ_UV_HEADINGS + ("user_vector_schemas",) +OPT_UV_HEADINGS = ("user_vector_schemas",) +UV_HEADINGS = () + REQ_UV_HEADINGS + OPT_UV_HEADINGS REQ_CV_HEADINGS = ("content_vector_data", "content_vector_transformations") -CV_HEADINGS = () + REQ_CV_HEADINGS + ("content_vector_schemas",) +OPT_CV_HEADINGS = ("content_vector_schemas",) +CV_HEADINGS = () + REQ_CV_HEADINGS + OPT_CV_HEADINGS DATASETS_HEADINGS = ("dataname",) + UV_HEADINGS + CV_HEADINGS +# 2. recommenders section +RECOMMENDERS_HEADINGS = ("user_recommenders", "content_recommenders") + +# 3. metrics section +METRICS_HEADINGS = ("metrics") + HEADINGS = { "datasets": DATASETS_HEADINGS, \ - "recommenders": ("recommenders"), \ - "metrics": ("metrics") \ - } + "recommenders": RECOMMENDERS_HEADINGS, \ + "metrics": METRICS_HEADINGS \ + } def map_section(config_parser, section): """ Map a section with the given section name and return a dictionary of the section. diff --git a/hermes/modules/helper.py b/hermes/modules/helper.py index 6d9a10e..6c74ced 100644 --- a/hermes/modules/helper.py +++ b/hermes/modules/helper.py @@ -46,6 +46,27 @@ def load_modules_in_dir(dir_path): Globals.logger.error(err, exc_info=True) raise +# check whether checkcls is the cls or direct subclass of cls +def is_direct_subclass(obj, cls): + # 1. make sure that checkcls is a class object + checkcls = obj + if not inspect.isclass(obj): + checkcls = obj.__class__ + # 2. check if checkcls == cls; if it is, return True + # 3. 
check if cls is a direct parent of checkcls + return type(checkcls) == type(cls) or cls in checkcls.__bases__ + +# check whether checkcls it the cls or non-direct subclass of cls +def is_non_direct_subclass(checkcls, cls): + # 1. make sure that checkcls is a class object + checkcls = obj + if not inspect.isclass(obj): + checkcls = obj.__class__ + # 2. check if checkcls == cls; if it is, return True + # 3. check if checkcls ia subclass of cls + return type(checkcls) == type(cls) or issubclass(checkcls, cls) + + # return generator of direct descendants def get_direct_subclasses(module, cls): try: From e68346747ee9e87b07f67d617b890414da305bb9 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Tue, 12 Jan 2016 10:34:22 -0800 Subject: [PATCH 24/39] import submodules in __init__.py --- hermes/__init__.py | 8 +++++++- hermes/metrics/__init__.py | 2 ++ hermes/modules/__init__.py | 14 ++++++++++++++ hermes/modules/rg/__init__.py | 4 ++++ hermes/modules/vg/__init__.py | 2 ++ 5 files changed, 29 insertions(+), 1 deletion(-) diff --git a/hermes/__init__.py b/hermes/__init__.py index e43be03..fb408e4 100644 --- a/hermes/__init__.py +++ b/hermes/__init__.py @@ -1 +1,7 @@ -__version__ = '1.0' \ No newline at end of file +__version__ = '1.0' +import hermes +import hermesctl +import hermesui +import modules +import metrics +import utils \ No newline at end of file diff --git a/hermes/metrics/__init__.py b/hermes/metrics/__init__.py index e69de29..70fb85c 100644 --- a/hermes/metrics/__init__.py +++ b/hermes/metrics/__init__.py @@ -0,0 +1,2 @@ +import content_based +import performance_metrics \ No newline at end of file diff --git a/hermes/modules/__init__.py b/hermes/modules/__init__.py index e69de29..ea3399c 100644 --- a/hermes/modules/__init__.py +++ b/hermes/modules/__init__.py @@ -0,0 +1,14 @@ +import cargo +import config +import data +import helper +import hermesglobals +import metricgenerator +import recommendergenerator +import singleton +import statemachine +import timer +import vectorgenerator +import vg +import rg +import mg \ No newline at end of file diff --git a/hermes/modules/rg/__init__.py b/hermes/modules/rg/__init__.py index e69de29..ae4a8ac 100644 --- a/hermes/modules/rg/__init__.py +++ b/hermes/modules/rg/__init__.py @@ -0,0 +1,4 @@ +import default_usecase +import interface +import with_tfidf_usecase +import without_tfidf_usecase \ No newline at end of file diff --git a/hermes/modules/vg/__init__.py b/hermes/modules/vg/__init__.py index e69de29..ed2e0d4 100644 --- a/hermes/modules/vg/__init__.py +++ b/hermes/modules/vg/__init__.py @@ -0,0 +1,2 @@ +import movielens_vectorgenerator +import wiki_vectorgenerator \ No newline at end of file From 2dd2421e22649ec9730f474fa94378ff6d589df6 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Tue, 12 Jan 2016 12:10:39 -0800 Subject: [PATCH 25/39] remove sqlCtx in vectorgenerator because sqlCtx is a global variable --- hermes/configs/movielens_config.ini | 2 + hermes/hermes.py | 72 ++++++++++++------- hermes/modules/vectorgenerator.py | 35 +++++---- hermes/modules/vg/__init__.py | 2 - .../modules/vg/movielens_vectorgenerator.py | 3 +- hermes/modules/vg/wiki_vectorgenerator.py | 13 ++-- 6 files changed, 79 insertions(+), 48 deletions(-) diff --git a/hermes/configs/movielens_config.ini b/hermes/configs/movielens_config.ini index c55ea3b..4a76a87 100644 --- a/hermes/configs/movielens_config.ini +++ b/hermes/configs/movielens_config.ini @@ -1,8 +1,10 @@ [datasets] dataname = movielens +# user vectors user_vector_data = ["movielens_10m_ratings"] 
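The values in this config file are JSON-encoded lists inside INI sections; hermesctl loads them with json.loads into cargo (see the user_recommenders / content_recommenders handling above). A rough, illustrative way to read the same file outside of hermesctl (the path is an example):

    import ConfigParser
    import json

    parser = ConfigParser.ConfigParser()
    parser.read("hermes/configs/movielens_config.ini")

    user_vector_data = json.loads(parser.get("datasets", "user_vector_data"))          # ["movielens_10m_ratings"]
    user_recommenders = json.loads(parser.get("recommenders", "user_recommenders"))    # ["ALS"]
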
user_vector_schemas = ["movielens_10m_ratings_schema"] user_vector_transformations = ["ratings"] +# content vectors #content_vector_data = ["movielens_10m_movies"] #content_vector_schemas = ["movielens_10m_movies_schema"] #content_vector_transformations = ["genre"] diff --git a/hermes/hermes.py b/hermes/hermes.py index f8f1dfd..f219a69 100644 --- a/hermes/hermes.py +++ b/hermes/hermes.py @@ -16,10 +16,8 @@ # TODO: empty certain items in cargo after no longer needed? # TODO: when to use error_state? do try-catch for all states? -def start_state(cargo): - """Start of the state machine. Create HDFS directory and upload the input data. - Returns: json_to_rdd_state as next state - """ +def __start(cargo): + """start_state without the state machine.""" if Globals.verbose: Globals.logger.debug("In start_state:") @@ -34,16 +32,21 @@ def load_json_files(datas): load_json_files(cargo.datas) +def start_state(cargo): + """Start of the state machine. Create HDFS directory and upload the input data. + Returns: json_to_rdd_state as next state + """ + + __start(cargo) + newState = json_to_rdd_state if Globals.verbose: Globals.logger.debug("start_state -> json_to_rdd_state") return newState, cargo # TODO: make json_to_rdd_state, split_data_state, and make_prediction_state into one state? -def json_to_rdd_state(cargo): - """Parse JSON to RDD. - Returns: split_data_state as next state - """ +def __json_to_rdd(cargo): + """json_to_rdd_state without the state macine.""" if Globals.verbose: Globals.logger.debug("In json_to_rdd_state:") @@ -57,24 +60,27 @@ def json_to_rdd_state(cargo): data.set_dataframe(Globals.scsingleton, datapath_in_hdfs) if Globals.verbose: Globals.logger.debug("Creating RDD based on the computed dataframe and configuration provided by the user") - # TODO: remove sqlCtx since it's global? - cargo.vectors.append( vg.VectorFactory().create_obj_vector(Globals.scsingleton.sqlCtx, data, cargo.support_files) ) - + cargo.vectors.append( vg.VectorFactory().create_obj_vector(data, cargo.support_files) ) # TODO: clean cargo? # cargo.datas = [] # cargo.hdfs_dir = None # cargo.fs_default_ip_addr = None +def json_to_rdd_state(cargo): + """Parse JSON to RDD. + Returns: split_data_state as next state + """ + + __json_to_rdd(cargo) + newState = split_data_state if Globals.verbose: Globals.logger.debug("json_to_rdd_state -> split_data_state") return newState, cargo -def split_data_state(cargo): - """Split data to train, test, and (optional) validate. - Returns: make_prediction_state as next state - """ +def __split_data(cargo): + """split_data_state without the state machine.""" if Globals.verbose: Globals.logger.debug("In split_data_state:") @@ -83,15 +89,20 @@ def split_data_state(cargo): weights, seed = hermesui._ask_user_for_split_percentage(vector.data.datapath) vector.split_data(weights, seed) +def split_data_state(cargo): + """Split data to train, test, and (optional) validate. + Returns: make_prediction_state as next state + """ + + __split_data(cargo) + newState = make_prediction_state if Globals.verbose: Globals.logger.debug("split_data_state -> make_prediction_state") return newState, cargo -def make_prediction_state(cargo): - """Develop model based on the train data and make prediction based on this model. 
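Each state is being split into a double-underscore body plus a thin wrapper that only picks the next state, so the bodies can also be chained by hand without the state machine, presumably for driving hermes interactively (later patches in this series target notebook use). A hedged sketch of that manual chaining, assuming a cargo object already populated by hermesctl and a running SparkContext; the remaining helpers are defined further down in this patch:

    __start(cargo)              # create the HDFS directory and upload the input json files
    __json_to_rdd(cargo)        # build dataframes and vector objects
    __split_data(cargo)         # split each vector into train / test / validate
    __make_prediction(cargo)    # train recommenders and fill the prediction vectors
    __calculate_metrics(cargo)  # run the metrics requested in the config
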
- Returns: calculate_metrics_state as next state - """ +def __make_prediction(cargo): + """make_prediction_state without the state machine.""" if Globals.verbose: Globals.logger.debug("In make_prediction_state:") @@ -123,22 +134,26 @@ def make_prediction_state(cargo): thisvector.prediction_vector = recommender.make_prediction() if Globals.verbose: Globals.logger.debug("Making prediction takes %s seconds" % t.secs) +def make_prediction_state(cargo): + """Develop model based on the train data and make prediction based on this model. + Returns: calculate_metrics_state as next state + """ + + __make_prediction(cargo) + newState = calculate_metrics_state if Globals.verbose: Globals.logger.debug("make_prediction_state -> calculate_metrics_state") return newState, cargo -def calculate_metrics_state(cargo): - """Test the metrics specified by the user. This is an end state. - Returns: None because this is the last state - """ +def __calculate_metrics(cargo): + """calculate_metrics_state without the state machine.""" if Globals.verbose: Globals.logger.debug("In calculate_metrics_state:") # create a metric executor executor = mg.MetricExecutor(mg.Metric()) - # TODO: figure out why logger prints INFO twice for i in range(0, len(cargo.vectors)): Globals.logger.info("-" * 80) Globals.logger.info("Data: %s" % cargo.vectors[i].data.datapath) @@ -152,6 +167,14 @@ def calculate_metrics_state(cargo): Globals.logger.info("Metric: %s = %f" % (m, executor.execute(cargo.vectors[i]))) if Globals.verbose: Globals.logger.debug("Calculating metric takes %s seconds" % t.secs) Globals.logger.info("-" * 80) + +def calculate_metrics_state(cargo): + """Test the metrics specified by the user. This is an end state. + Returns: None because this is the last state + """ + + __calculate_metrics(cargo) + if Globals.verbose: Globals.logger.debug("calculate_metrics_state -> end_state") return @@ -160,6 +183,7 @@ def error_state(cargo): """Error state. Print out the error messages. This is an end state. 
Returns: None because this is the last state """ + if Globals.verbose: Globals.logger.debug("In error_state:") Globals.logger.error("ERROR: " + cargo.error_msg) if Globals.verbose: Globals.logger.debug("error_state -> end_state") diff --git a/hermes/modules/vectorgenerator.py b/hermes/modules/vectorgenerator.py index 99387f9..ef6b660 100644 --- a/hermes/modules/vectorgenerator.py +++ b/hermes/modules/vectorgenerator.py @@ -9,26 +9,26 @@ # ================================================================================ class VectorFactory(object): - def create_vector(self, sqlCtx, data, support_files): + def create_vector(self, data, support_files): vector = data.which_vector # get subclasses that inherit from either UserVector or ContentVector # from modules in hermes/hermes/modules/vectors directory for module in helper.load_modules_in_dir(Globals.constants.DIR_VECTORS_PATH): for subclass in helper.get_direct_subclasses(module, vector): if subclass.isSameDataInstance(data): - return subclass(sqlCtx, data, support_files).vector + return subclass(data, support_files).vector else: # cannot find class that builds the data raise ValueError - def create_obj_vector(self, sqlCtx, data, support_files): + def create_obj_vector(self, data, support_files): vector = data.which_vector # get subclasses that inherit from either UserVector or ContentVector # from modules in hermes/hermes/modules/vectors directory for module in helper.load_modules_in_dir(Globals.constants.DIR_VECTORS_PATH): for subclass in helper.get_direct_subclasses(module, vector): if subclass.isSameDataInstance(data): - return subclass(sqlCtx, data, support_files) + return subclass(data, support_files) else: # cannot find class that builds the data raise ValueError @@ -38,9 +38,7 @@ def create_obj_vector(self, sqlCtx, data, support_files): # ================================================================================ class Vector(object): - def __init__(self, sqlCtx, data, support_files): - # TODO: remove sqlCtx because it is global? 
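The sqlCtx constructor argument is dropped here because the SQLContext already lives on the scsingleton global; vector subclasses are expected to reach it through Globals instead, as the wiki generator below is updated to do. The access pattern, with an illustrative query that assumes a "ratings" temp table has already been registered:

    from modules.hermesglobals import Globals

    sqlCtx = Globals.scsingleton.sqlCtx
    filtered = sqlCtx.sql("select * from ratings where user_id is not null")  # illustrative filter
    filtered.registerTempTable("wiki_ratings")
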
- self.sqlCtx = sqlCtx + def __init__(self, data, support_files): self.data = data self.support_files = support_files vector_transformation = getattr(self, data.vector_transformation) @@ -48,6 +46,17 @@ def __init__(self, sqlCtx, data, support_files): self.vector = None else: self.vector = vector_transformation() + + def split_data(self, weights, seed): + raise NotImplemented + +# ================================================================================ +# User Vector and Content Vector Factory Objects +# ================================================================================ + +class UserVector(Vector): + def __init__(self, data, support_files): + super(UserVector, self).__init__(data, support_files) self.training_vector = None self.test_vector = None self.validation_vector = None @@ -59,15 +68,11 @@ def split_data(self, weights, seed): self.test_vector = test_vector self.validation_vector = validation_vector -# ================================================================================ -# User Vector and Content Vector Factory Objects -# ================================================================================ - -class UserVector(Vector): - pass - class ContentVector(Vector): - pass + def __init__(self, data, support_files, user_vector): + super(ContentVector, self)._init__(data, support_files) + self.user_vector = user_vector + # ================================================================================ # User Vector and Content Vector for specific datasetes diff --git a/hermes/modules/vg/__init__.py b/hermes/modules/vg/__init__.py index ed2e0d4..e69de29 100644 --- a/hermes/modules/vg/__init__.py +++ b/hermes/modules/vg/__init__.py @@ -1,2 +0,0 @@ -import movielens_vectorgenerator -import wiki_vectorgenerator \ No newline at end of file diff --git a/hermes/modules/vg/movielens_vectorgenerator.py b/hermes/modules/vg/movielens_vectorgenerator.py index 429b4cc..e4a12f6 100644 --- a/hermes/modules/vg/movielens_vectorgenerator.py +++ b/hermes/modules/vg/movielens_vectorgenerator.py @@ -1,4 +1,5 @@ -from hermes.modules.vectorgenerator import UserVector, ContentVector +from modules.vectorgenerator import UserVector, ContentVector +from modules.hermesglobals import Globals # ================================================================================ # MovieLens diff --git a/hermes/modules/vg/wiki_vectorgenerator.py b/hermes/modules/vg/wiki_vectorgenerator.py index e5c92a0..48ab958 100644 --- a/hermes/modules/vg/wiki_vectorgenerator.py +++ b/hermes/modules/vg/wiki_vectorgenerator.py @@ -1,4 +1,5 @@ -from hermes.modules.vectorgenerator import UserVector, ContentVector +from modules.vectorgenerator import UserVector, ContentVector +from modules.hermesglobals import Globals # ================================================================================ # Wiki @@ -12,23 +13,23 @@ def isSameDataInstance(cls, comparisonData): class WikiUserVector(UserVector, Wiki): def __init__(self): super(self.__class__, self).__init__() - self.filtered = self.sqlCtx.sql("select * from ratings where redirect_target is null and article_namespace=0 and user_id is not null") + self.filtered = Globals.scsingleton.sqlCtx.sql("select * from ratings where redirect_target is null and article_namespace=0 and user_id is not null") self.filtered.registerTempTable("wiki_ratings") def num_edits(self): - return self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings group by user_id, article_id") + return Globals.scsingleton.sqlCtx.sql("select 
user_id as user, article_id as item, count(1) as rating from wiki_ratings group by user_id, article_id") def any_interact(self): - return self.sqlCtx.sql("select user_id as user, article_id as item, 1 as rating from wiki_ratings group by user_id, article_id") + return Globals.scsingleton.sqlCtx.sql("select user_id as user, article_id as item, 1 as rating from wiki_ratings group by user_id, article_id") def num_edits_ceil(self): - return self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings group by user_id, article_id")\ + return Globals.scsingleton.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings group by user_id, article_id")\ .map(lambda (user, article, rating): (user, article, max(rating, 5))) class WikiContentVector(ContentVector, Wiki): def __init__(self): super(self.__class__, self).__init__() - self.filtered_content = sqlCtx.sqlCtx.sql("select * from content where redirect_target is null and article_namespace=0 and full_text is not null") + self.filtered_content = Globals.scsingleton.sqlCtx.sql("select * from content where redirect_target is null and article_namespace=0 and full_text is not null") self.filtered_content.registerTempTable("wiki_content") def glove(self): From 6300ad5f2112240ec72e31c83b12ddb3d9a4a58e Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Tue, 12 Jan 2016 13:41:48 -0800 Subject: [PATCH 26/39] implement so that data class does not know about vector classes --- hermes/modules/data.py | 11 ++++++----- hermes/modules/hermesglobals.py | 5 ++--- hermes/modules/vectorgenerator.py | 14 ++++++++++++-- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/hermes/modules/data.py b/hermes/modules/data.py index 47d42bd..b2b124d 100644 --- a/hermes/modules/data.py +++ b/hermes/modules/data.py @@ -1,13 +1,14 @@ import helper -import vectorgenerator # TODO: avoid this? +from hermesglobals import Globals # TODO: a better way of storing configuration from configuration file? class Data(object): """ Store configuration from configuration files. """ def __init__(self, datapath, vector_transformation, schemapath, dataname): - if helper.is_filepath_valid(datapath): - self.datapath = datapath + #if not helper.is_filepath_valid(datapath): + # raise OSError + self.datapath = datapath self.dataname = dataname self.vector_transformation = vector_transformation self.schema = helper.get_schema(schemapath) @@ -28,12 +29,12 @@ def set_dataframe(self, scsingleton, datapath_in_hdfs): class UserVectorData(Data): def __init__(self, datapath, vector_transformation, schemapath, dataname): super(self.__class__, self).__init__(datapath, vector_transformation, schemapath, dataname) - self.which_vector = vectorgenerator.UserVector + self.which_vector = Globals.constants.USERVECTOR class ContentVectorData(Data): def __init__(self, datapath, vector_transformation, schemapath, dataname): super(self.__class__, self).__init__(datapath, vector_transformation, schemapath, dataname) - self.which_vector = vectorgenerator.ContentVector + self.which_vector = Globals.constants.CONTENTVECTOR diff --git a/hermes/modules/hermesglobals.py b/hermes/modules/hermesglobals.py index 56964c7..bac595d 100644 --- a/hermes/modules/hermesglobals.py +++ b/hermes/modules/hermesglobals.py @@ -1,8 +1,5 @@ import os - - - class Globals(object): """Globals contains global variables shared by all files. 
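With this patch Data no longer imports vectorgenerator; it records only a string constant (added to Globals just below), and VectorFactory, updated later in this patch, maps the constant back onto the UserVector or ContentVector class. Roughly, assuming a readable datapath and schema file (the paths here are placeholders):

    data = UserVectorData("/path/to/ratings.json", "ratings", "/path/to/ratings_schema.json", "movielens")
    print data.which_vector   # "UserVector", i.e. Globals.constants.USERVECTOR

    vector_obj = VectorFactory().create_obj_vector(data, support_files={})
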
@@ -17,6 +14,8 @@ class Globals(object): class Constants(object): def __init__(self): + self.USERVECTOR = "UserVector" + self.CONTENTVECTOR = "ContentVector" self.DIR_VECTORS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "vg" self.DIR_RECOMMENDERS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "rg" self.DIR_METRICS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "mg" diff --git a/hermes/modules/vectorgenerator.py b/hermes/modules/vectorgenerator.py index ef6b660..b2213df 100644 --- a/hermes/modules/vectorgenerator.py +++ b/hermes/modules/vectorgenerator.py @@ -10,7 +10,12 @@ class VectorFactory(object): def create_vector(self, data, support_files): - vector = data.which_vector + # select which vector to create + vector = None + if data.which_vector == Globals.constants.USERVECTOR: + vector = UserVector + elif data.which_vector == Globals.constants.CONTENTVECTOR: + vector = ContentVector # get subclasses that inherit from either UserVector or ContentVector # from modules in hermes/hermes/modules/vectors directory for module in helper.load_modules_in_dir(Globals.constants.DIR_VECTORS_PATH): @@ -22,7 +27,12 @@ def create_vector(self, data, support_files): raise ValueError def create_obj_vector(self, data, support_files): - vector = data.which_vector + # select which vector to create + vector = None + if data.which_vector == Globals.constants.USERVECTOR: + vector = UserVector + elif data.which_vector == Globals.constants.CONTENTVECTOR: + vector = ContentVector # get subclasses that inherit from either UserVector or ContentVector # from modules in hermes/hermes/modules/vectors directory for module in helper.load_modules_in_dir(Globals.constants.DIR_VECTORS_PATH): From 9559c62eaa6c85b7f5e2b406dde07365a99737e9 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Tue, 12 Jan 2016 15:15:07 -0800 Subject: [PATCH 27/39] Data class does not need to know about the SparkContext --- hermes/hermes.py | 2 +- hermes/modules/data.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/hermes/hermes.py b/hermes/hermes.py index f219a69..3e58be2 100644 --- a/hermes/hermes.py +++ b/hermes/hermes.py @@ -57,7 +57,7 @@ def __json_to_rdd(cargo): if Globals.verbose: Globals.logger.debug("Creating dataframe based on the content of the json file") datapath_in_hdfs = "hdfs://" + cargo.fs_default_ip_addr + "/" + cargo.hdfs_dir + "/" + os.path.basename(data.datapath) - data.set_dataframe(Globals.scsingleton, datapath_in_hdfs) + data.set_dataframe(Globals.scsingleton.sc, Globals.scsingleton.sqlCtx, datapath_in_hdfs) if Globals.verbose: Globals.logger.debug("Creating RDD based on the computed dataframe and configuration provided by the user") cargo.vectors.append( vg.VectorFactory().create_obj_vector(data, cargo.support_files) ) diff --git a/hermes/modules/data.py b/hermes/modules/data.py index b2b124d..c0e345c 100644 --- a/hermes/modules/data.py +++ b/hermes/modules/data.py @@ -15,12 +15,12 @@ def __init__(self, datapath, vector_transformation, schemapath, dataname): self.dataframe = None # TODO: do we need to know from which config the data is from? 
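The call site in hermes.py (changed above) now unpacks the singleton itself, so Data only ever sees a plain SparkContext and SQLContext. The pattern, with an illustrative file name:

    path_in_hdfs = "hdfs://" + cargo.fs_default_ip_addr + "/" + cargo.hdfs_dir + "/ratings.json"
    data.set_dataframe(Globals.scsingleton.sc, Globals.scsingleton.sqlCtx, path_in_hdfs)
    # set_dataframe (below) reads the json with the stored schema and repartitions the
    # dataframe to sc.defaultParallelism * 3 so more tasks can run on it in parallel
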
- def set_dataframe(self, scsingleton, datapath_in_hdfs): - self.dataframe = scsingleton.sqlCtx.read.json(datapath_in_hdfs, self.schema) + def set_dataframe(self, sc, sqlCtx, datapath_in_hdfs): + self.dataframe = sqlCtx.read.json(datapath_in_hdfs, self.schema) # explicitly repartition RDD after loading so that more tasks can run on it in parallel # by default, defaultMinPartitions == defaultParallelism == estimated # of cores across all of the machines in your cluster # TODO: a better way to go about the dataframe repartition? - self.dataframe = self.dataframe.repartition(scsingleton.sc.defaultParallelism * 3) + self.dataframe = self.dataframe.repartition(sc.defaultParallelism * 3) # set schema if it is not already set if self.schema is None: From 99be88a06fc9e56263b5481d46980376a5223303 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Tue, 12 Jan 2016 15:37:17 -0800 Subject: [PATCH 28/39] add submodules in modules's __init__.py --- hermes/modules/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hermes/modules/__init__.py b/hermes/modules/__init__.py index ea3399c..dee4efd 100644 --- a/hermes/modules/__init__.py +++ b/hermes/modules/__init__.py @@ -9,6 +9,10 @@ import statemachine import timer import vectorgenerator -import vg -import rg +import vg.movielens_vectorgenerator +import vg.wiki_vectorgenerator +import rg.default_usecase +import rg.interface +import rg.with_tfidf_usecase +import rg.without_tfidf_usecase import mg \ No newline at end of file From 58d2394b0f717506ae324e36c869282aca129c9d Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Tue, 12 Jan 2016 15:42:37 -0800 Subject: [PATCH 29/39] load module from zip file because in notebook, hermes is zipped up as a zipfile --- hermes/metrics/cf.py | 184 ++++++++++++++++++++++++++++++ hermes/modules/helper.py | 19 +++ hermes/modules/hermesglobals.py | 10 +- hermes/modules/vectorgenerator.py | 26 ++++- 4 files changed, 231 insertions(+), 8 deletions(-) create mode 100644 hermes/metrics/cf.py diff --git a/hermes/metrics/cf.py b/hermes/metrics/cf.py new file mode 100644 index 0000000..edf1380 --- /dev/null +++ b/hermes/metrics/cf.py @@ -0,0 +1,184 @@ +from sklearn.metrics.pairwise import cosine_similarity +from pyspark.sql.types import * +from pyspark.mllib.recommendation import ALS +import numpy as np + +def calc_cf_mllib(y_training_data): + """ + Utilizes the ALS collaborative filtering algorithm in MLLib to determine the predicted ratings + + Args: + y_training_data: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] + + Returns: + predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. + + """ + + model = ALS.train(y_training_data, rank = 10, iterations = 5) + #predict all user, item pairs + item_ids = y_training_data.map(lambda (u,i,r): i).distinct() + user_ids = y_training_data.map(lambda (u,i,r): u).distinct() + user_item_combo = user_ids.cartesian(item_ids) + + predicted = model.predictAll(user_item_combo.map(lambda x: (x[0], x[1]))) + + return predicted + + +def calc_user_user_cf(training_data): + """ + A very simple user-user CF algorithm in PySpark. 
Method is less stable than calc_user_user_cf2 + + Method derived from the Coursera course: Recommender Systems taught by Prof Joseph Konstan (Universitu of Minesota) + and Prof Michael Ekstrand (Texas State University) + + Args: + y_training_data: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] + + Returns: + predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. + + """ + + user_groups = training_data.groupBy(lambda (user, item, rating): user) + + user_groups_sim = user_groups.cartesian(user_groups).map(lambda ((user1_id, user1_rows), (user2_id, user2_rows)):\ + (user1_id, user2_id, similarity(user1_rows, user2_rows, 1))) + fields = [StructField("user1", LongType(),True),StructField("user2", LongType(), True),\ + StructField("similarity", FloatType(), True) ] + schema_sim = StructType(fields) + user_sim = sqlCtx.createDataFrame(user_groups_sim, schema_sim) + user_sim.registerTempTable("user_sim") + + fields = [StructField("user", LongType(),True),StructField("item", LongType(), True),\ + StructField("rating", FloatType(), True) ] + schema = StructType(fields) + user_sim_sql = sqlCtx.createDataFrame(training_data, schema) + user_sim_sql.registerTempTable("ratings") + + avg_ratings = sqlCtx.sql("select user, avg(rating) as avg_rating from ratings group by user") + avg_ratings.registerTempTable("averages") + + residual_ratings = sqlCtx.sql("select r.user, r.item, (r.rating-a.avg_rating) as resids from ratings r, \ + averages a where a.user=r.user") + residual_ratings.registerTempTable("residuals") + + user_sim_resids = sqlCtx.sql("select u.user2, r.user, r.item, r.resids, similarity, r.resids*similarity as r_w from residuals r, \ + user_sim u where r.user=u.user1") + user_sim_resids.registerTempTable("user_sim_resids") + + item_adjusts = sqlCtx.sql("select user2, item, sum(r_w)/sum(abs(similarity)) as item_adj from user_sim_resids group by user2, item") + item_adjusts.registerTempTable("item_adjusts") + + predictions = sqlCtx.sql("select user2 as user, item, (avg_rating +item_adj) as pred_rating \ + from item_adjusts i, averages a where a.user=i.user2") + + return predictions + +def calc_user_user_cf2(training_data): + """ + A very simple user-user CF algorithm in PySpark. Method is more stable than calc_user_user_cf + + Method derived from the Coursera course: Recommender Systems taught by Prof Joseph Konstan (Universitu of Minesota) + and Prof Michael Ekstrand (Texas State University) + + Args: + y_training_data: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] + + Returns: + predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. 
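Both user-user variants end up computing the same per (user, item) adjustment: the user's mean rating plus a similarity-weighted average of neighbour residuals, that is pred(u, i) = mean(u) + sum(sim * resid) / sum(|sim|). A tiny worked example of that arithmetic with made-up numbers; calc_item_adjust below computes the same quantity from its (residual*similarity, similarity) pairs:

    neighbour_rows = [(0.9, 1.0), (0.2, -0.5)]   # (similarity, residual) pairs for one item
    adjustment = sum(s * r for s, r in neighbour_rows) / sum(abs(s) for s, _ in neighbour_rows)
    prediction = 3.5 + adjustment                # 3.5 = this user's mean rating
    # adjustment = (0.9 - 0.1) / 1.1 = 0.727..., so prediction = 4.227...
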
+ + """ + + user_groups = training_data.groupBy(lambda (user, item, rating): user) + + user_groups_sim = user_groups.cartesian(user_groups).map(lambda ((user1_id, user1_rows), (user2_id, user2_rows)):\ + (user1_id, user2_id, similarity(user1_rows, user2_rows, 1))) + + user_averages = training_data.map(lambda (user, item, rating): (user, (rating))).groupByKey().\ + map(lambda (user, ratings): (user, np.mean(list(ratings)))) + + user_resids = training_data.map(lambda (user, item, rating): (user, (item, rating))).join(user_averages)\ + .map(lambda (user, ((item, rating), avg_rating)): (user, (item, rating-avg_rating))) + + item_adjustments = user_resids.join(user_groups_sim.map(lambda (u1, u2, sim): (u1, (u2, sim))))\ + .map(lambda (u1, ((item, resid), (u2, sim))): ((u2,item), (resid*sim, sim))).\ + groupByKey().map(lambda ((user, item), sim_list): (user, item, calc_item_adjust(sim_list))) + + predictions = item_adjustments.map(lambda (user, item, item_adj): (user, (item, item_adj))).join(user_averages)\ + .map(lambda (user, ((item, item_adj), (avg_rate))): (user, item, avg_rate+item_adj)) + + return predictions + +def calc_item_adjust(sim_resids): + #data coming into this function is a list of [residual*similarity, similarity] for all user, item paris + #we want to output sum(residual*similarity)/sum(abs(similarity)) + sum_r_w = 0 + sum_sim = 0 + for s in sim_resids: + sum_r_w += s[0] + sum_sim += abs(s[1]) + + if sum_sim ==0: + return 0 + else: + return sum_r_w/sum_sim + +def calc_item_item_cf(training_data): + """ + A very simple item-item CF algorithm in PySpark. + + Method derived from the Coursera course: Recommender Systems taught by Prof Joseph Konstan (Universitu of Minesota) + and Prof Michael Ekstrand (Texas State University) + + Args: + y_training_data: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] + + Returns: + predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. 
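User-user and item-item similarity both come from the similarity() helper defined further down: ratings on co-rated items (or from co-rating users) are paired up and handed to scikit-learn's cosine_similarity, exactly as the helper does. An isolated illustration with made-up ratings, mirroring the helper's call:

    from sklearn.metrics.pairwise import cosine_similarity

    rating_match = [(4.0, 5.0), (3.0, 3.0), (1.0, 2.0)]        # (rating by a, rating by b) on shared items
    sim = float(cosine_similarity(*zip(*rating_match))[0][0])  # roughly 0.99 for these overlapping ratings
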
+ + """ + + item_groups = training_data.groupBy(lambda (user, item, rating): item) + item_similarity = item_groups.cartesian(item_groups).map(lambda ((item1_id, item1_rows), (item2_id, item2_rows)):\ + (item1_id, item2_id, similarity(item1_rows, item2_rows, 0))) + + user_item_sim = training_data.keyBy(lambda (user, item, rating): item)\ + .join(item_similarity.keyBy(lambda (item1, item2, sim): item1))\ + .map(lambda (item_id,((user, item, rating),(item1, item2, sim))):((user, item2), (item,rating,sim)))\ + .filter(lambda ((user, item2), (item,rating,sim)): item2!=item) + + predictions = user_item_sim.groupByKey()\ + .map(lambda ((user, item), rows): (user, item, get_item_prob(rows))) + + return predictions + +def similarity(item1_rows, item2_rows, index): + #to determine user similarity index=0 + #to determine item similarity index=1 + rating_match = [] + for i in item1_rows: + for j in item2_rows: + if i[index]==j[index]: + rating_match.append((i[2],j[2])) + + if len(rating_match)==0: + sim = 0.0 + else: + sim = cosine_similarity(*zip(*rating_match))[0][0] + + return float(sim) + +def get_item_prob(rows): + nom = 0 + denom = 0 + for r in rows: + nom += r[1]*r[2] + denom += abs(r[2]) + + if denom ==0: + return 0 + else: + item_prob = nom/denom + return float(item_prob) \ No newline at end of file diff --git a/hermes/modules/helper.py b/hermes/modules/helper.py index 6c74ced..bf0f76f 100644 --- a/hermes/modules/helper.py +++ b/hermes/modules/helper.py @@ -7,6 +7,8 @@ import md5 import os import traceback +import zipfile +import zipimport from pyspark.sql.types import StructType @@ -21,6 +23,23 @@ def get_schema(schema_path): with open(schema_path, "r") as schema_file: return StructType.fromJson(json.load(schema_file)) +def load_modules_in_zip(zipfile_path, which_dir): + try: + try: + zh = zipfile.ZipFile(zipfile_path) + zi = zipimport.zipimporter(zipfile_path) + for name in zh.namelist(): + if os.path.basename(os.path.dirname(name)) == which_dir: + module = zi.load_module(os.path.splitext(name)[0]) + yield module + finally: + try: zh.close() + except: pass + except Exception as err: + Globals.logger.error(err, exc_info=True) + raise + + def load_modules_in_dir(dir_path): try: try: diff --git a/hermes/modules/hermesglobals.py b/hermes/modules/hermesglobals.py index bac595d..4cf5115 100644 --- a/hermes/modules/hermesglobals.py +++ b/hermes/modules/hermesglobals.py @@ -16,9 +16,13 @@ class Constants(object): def __init__(self): self.USERVECTOR = "UserVector" self.CONTENTVECTOR = "ContentVector" - self.DIR_VECTORS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "vg" - self.DIR_RECOMMENDERS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "rg" - self.DIR_METRICS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + "mg" + self.ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) + self.DIR_VECTORS_NAME = "vg" + self.DIR_VECTORS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + self.DIR_VECTORS_NAME + self.DIR_RECOMMENDERS_NAME = "rg" + self.DIR_RECOMMENDERS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + self.DIR_RECOMMENDERS_NAME + self.DIR_METRICS_NAME = "mg" + self.DIR_METRICS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + self.DIR_METRICS_NAME def __setattr__(self, attr, value): if hasattr(self, attr): diff --git a/hermes/modules/vectorgenerator.py b/hermes/modules/vectorgenerator.py index b2213df..cd5ab9a 100644 --- a/hermes/modules/vectorgenerator.py +++ b/hermes/modules/vectorgenerator.py @@ 
-9,16 +9,24 @@ # ================================================================================ class VectorFactory(object): - def create_vector(self, data, support_files): + def create_vector(self, data, support_files, runs_from_notebook=False): # select which vector to create vector = None if data.which_vector == Globals.constants.USERVECTOR: vector = UserVector elif data.which_vector == Globals.constants.CONTENTVECTOR: vector = ContentVector + else: + raise Exception + # select if we are loading modules from a directory or a zip + generator = None + if runs_from_notebook: + generator = helper.load_modules_in_zip(Globals.constants.ROOT_PATH, Globals.constants.DIR_VECTORS_NAME) + else: + generator = helper.load_modules_in_dir(Globals.constants.DIR_VECTORS_PATH) # get subclasses that inherit from either UserVector or ContentVector # from modules in hermes/hermes/modules/vectors directory - for module in helper.load_modules_in_dir(Globals.constants.DIR_VECTORS_PATH): + for module in generator: for subclass in helper.get_direct_subclasses(module, vector): if subclass.isSameDataInstance(data): return subclass(data, support_files).vector @@ -26,19 +34,27 @@ def create_vector(self, data, support_files): # cannot find class that builds the data raise ValueError - def create_obj_vector(self, data, support_files): + def create_obj_vector(self, data, support_files, runs_from_notebook=False): # select which vector to create vector = None if data.which_vector == Globals.constants.USERVECTOR: vector = UserVector elif data.which_vector == Globals.constants.CONTENTVECTOR: vector = ContentVector + else: + raise Exception + # select if we are loading modules from a directory or a zip + generator = None + if runs_from_notebook: + generator = helper.load_modules_in_zip(Globals.constants.ROOT_PATH, Globals.constants.DIR_VECTORS_NAME) + else: + generator = helper.load_modules_in_dir(Globals.constants.DIR_VECTORS_PATH) # get subclasses that inherit from either UserVector or ContentVector # from modules in hermes/hermes/modules/vectors directory - for module in helper.load_modules_in_dir(Globals.constants.DIR_VECTORS_PATH): + for module in generator: for subclass in helper.get_direct_subclasses(module, vector): if subclass.isSameDataInstance(data): - return subclass(data, support_files) + return subclass(data, support_files) else: # cannot find class that builds the data raise ValueError From 08922efdcb3175014250a58194d3f546d23e0a20 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Wed, 13 Jan 2016 11:14:38 -0800 Subject: [PATCH 30/39] add assumptions about load_modules_in_zip() --- docs/assumptions.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/assumptions.md b/docs/assumptions.md index e7df056..c0685c7 100644 --- a/docs/assumptions.md +++ b/docs/assumptions.md @@ -2,6 +2,7 @@ * [Assumptions on Execution](#assumptions-on-execution) * [Assumptions on Vector Creation](#assumptions-on-vector-creation) +* [Assumptions on Directory Creation](#assumptions-on-directory-creation) ## Assumptions on Execution @@ -64,4 +65,6 @@ We have yet to determine why this is the case. When users add a new dataset, we cannot always assume that they will import exactly as "from hermes.modules.vectorgenerator import UserVector, ContentVector" because they can import it as "from modules.vectorgenerator import UserVector, ContentVector" since it is valid. 
For this reason, we have made an assumption that if the parent class of the MovieLensUserVector, for example, has the __name__ UserVector, MovieLensUserVector is the child of UserVector. The problem of this assummption is that if MovieLensUserVector inherits multiple parents from different module with the same class name, it can become a problem as it will treat both parents with the same class name as the same. +## Assumptions on Directory Creation +We made an assumption that there is only one directory with the label "vg", "rg", and "mg". These directories store the modules for vector, recommender, and metric creation specific to either datasets or use cases. The assumption is made in the helper function load_modules_in_zip() where it checks for the base directory of the file path if the base directory is "vg", "rg", or "mg" to load the modules in the notebook during vector, recommender, or metric creation respectively. From 785b1f317d8dba306fc9d105a76dd2cc829b8703 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Wed, 13 Jan 2016 12:12:46 -0800 Subject: [PATCH 31/39] use cf.py in recommendergenerator --- hermes/modules/rg/default_usecase.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/hermes/modules/rg/default_usecase.py b/hermes/modules/rg/default_usecase.py index f63dca1..2fde1e9 100644 --- a/hermes/modules/rg/default_usecase.py +++ b/hermes/modules/rg/default_usecase.py @@ -1,6 +1,6 @@ from interface import ImplementationInterface -import pyspark.mllib.recommendation as mllib +import hermes.metrics.cf as cf # ================================================================================ # Concrete background implementations for default use cases @@ -9,6 +9,11 @@ class Default(ImplementationInterface): def make_prediction_with_als(self, vector): # TODO: specify rank based on what the user wants + """ + import pyspark.mllib.recommendation as mllib model = mllib.ALS.train(vector.training_vector, rank=3) prediction_vector = model.predictAll( vector.test_vector.map( lambda x: (x[0], x[1]) ) ).cache() - return prediction_vector \ No newline at end of file + return prediction_vector + """ + vector.prediction_vector = cf.calc_cf_mllib(vector.training_vector) + return vector.prediction_vector \ No newline at end of file From 79f8384f0a1c4c9f432d500653a8668281fb18db Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Wed, 13 Jan 2016 12:47:27 -0800 Subject: [PATCH 32/39] convert metrics directory to algorithms directory for consistency --- hermes/__init__.py | 2 +- hermes/{metrics => algorithms}/__init__.py | 0 hermes/{metrics => algorithms}/cf.py | 0 .../{metrics => algorithms}/content_based.py | 0 .../performance_metrics.py | 0 hermes/hermes.py | 2 +- hermes/modules/metricgenerator.py | 2 +- hermes/modules/rg/default_usecase.py | 2 +- {src => hermes}/utils/osm_etl/osm.py | 0 src/algorithms/cf.py | 184 ------------------ 10 files changed, 4 insertions(+), 188 deletions(-) rename hermes/{metrics => algorithms}/__init__.py (100%) rename hermes/{metrics => algorithms}/cf.py (100%) rename hermes/{metrics => algorithms}/content_based.py (100%) rename hermes/{metrics => algorithms}/performance_metrics.py (100%) rename {src => hermes}/utils/osm_etl/osm.py (100%) delete mode 100644 src/algorithms/cf.py diff --git a/hermes/__init__.py b/hermes/__init__.py index fb408e4..33d7f0e 100644 --- a/hermes/__init__.py +++ b/hermes/__init__.py @@ -3,5 +3,5 @@ import hermesctl import hermesui import modules -import metrics +import algorithms import utils \ No newline at end of file diff 
--git a/hermes/metrics/__init__.py b/hermes/algorithms/__init__.py similarity index 100% rename from hermes/metrics/__init__.py rename to hermes/algorithms/__init__.py diff --git a/hermes/metrics/cf.py b/hermes/algorithms/cf.py similarity index 100% rename from hermes/metrics/cf.py rename to hermes/algorithms/cf.py diff --git a/hermes/metrics/content_based.py b/hermes/algorithms/content_based.py similarity index 100% rename from hermes/metrics/content_based.py rename to hermes/algorithms/content_based.py diff --git a/hermes/metrics/performance_metrics.py b/hermes/algorithms/performance_metrics.py similarity index 100% rename from hermes/metrics/performance_metrics.py rename to hermes/algorithms/performance_metrics.py diff --git a/hermes/hermes.py b/hermes/hermes.py index 3e58be2..c2872b9 100644 --- a/hermes/hermes.py +++ b/hermes/hermes.py @@ -131,7 +131,7 @@ def __make_prediction(cargo): # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithoutTfidf()) # etc. with Timer() as t: - thisvector.prediction_vector = recommender.make_prediction() + prediction_vector = recommender.make_prediction() if Globals.verbose: Globals.logger.debug("Making prediction takes %s seconds" % t.secs) def make_prediction_state(cargo): diff --git a/hermes/modules/metricgenerator.py b/hermes/modules/metricgenerator.py index fb74594..2ff0b7a 100644 --- a/hermes/modules/metricgenerator.py +++ b/hermes/modules/metricgenerator.py @@ -2,7 +2,7 @@ import os import sys sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/..")) -import metrics.performance_metrics as pm +import algorithms.performance_metrics as pm """ diff --git a/hermes/modules/rg/default_usecase.py b/hermes/modules/rg/default_usecase.py index 2fde1e9..0d0c88d 100644 --- a/hermes/modules/rg/default_usecase.py +++ b/hermes/modules/rg/default_usecase.py @@ -1,6 +1,6 @@ from interface import ImplementationInterface -import hermes.metrics.cf as cf +import hermes.algorithms.cf as cf # ================================================================================ # Concrete background implementations for default use cases diff --git a/src/utils/osm_etl/osm.py b/hermes/utils/osm_etl/osm.py similarity index 100% rename from src/utils/osm_etl/osm.py rename to hermes/utils/osm_etl/osm.py diff --git a/src/algorithms/cf.py b/src/algorithms/cf.py deleted file mode 100644 index edf1380..0000000 --- a/src/algorithms/cf.py +++ /dev/null @@ -1,184 +0,0 @@ -from sklearn.metrics.pairwise import cosine_similarity -from pyspark.sql.types import * -from pyspark.mllib.recommendation import ALS -import numpy as np - -def calc_cf_mllib(y_training_data): - """ - Utilizes the ALS collaborative filtering algorithm in MLLib to determine the predicted ratings - - Args: - y_training_data: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] - - Returns: - predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. - - """ - - model = ALS.train(y_training_data, rank = 10, iterations = 5) - #predict all user, item pairs - item_ids = y_training_data.map(lambda (u,i,r): i).distinct() - user_ids = y_training_data.map(lambda (u,i,r): u).distinct() - user_item_combo = user_ids.cartesian(item_ids) - - predicted = model.predictAll(user_item_combo.map(lambda x: (x[0], x[1]))) - - return predicted - - -def calc_user_user_cf(training_data): - """ - A very simple user-user CF algorithm in PySpark. 
Method is less stable than calc_user_user_cf2 - - Method derived from the Coursera course: Recommender Systems taught by Prof Joseph Konstan (Universitu of Minesota) - and Prof Michael Ekstrand (Texas State University) - - Args: - y_training_data: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] - - Returns: - predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. - - """ - - user_groups = training_data.groupBy(lambda (user, item, rating): user) - - user_groups_sim = user_groups.cartesian(user_groups).map(lambda ((user1_id, user1_rows), (user2_id, user2_rows)):\ - (user1_id, user2_id, similarity(user1_rows, user2_rows, 1))) - fields = [StructField("user1", LongType(),True),StructField("user2", LongType(), True),\ - StructField("similarity", FloatType(), True) ] - schema_sim = StructType(fields) - user_sim = sqlCtx.createDataFrame(user_groups_sim, schema_sim) - user_sim.registerTempTable("user_sim") - - fields = [StructField("user", LongType(),True),StructField("item", LongType(), True),\ - StructField("rating", FloatType(), True) ] - schema = StructType(fields) - user_sim_sql = sqlCtx.createDataFrame(training_data, schema) - user_sim_sql.registerTempTable("ratings") - - avg_ratings = sqlCtx.sql("select user, avg(rating) as avg_rating from ratings group by user") - avg_ratings.registerTempTable("averages") - - residual_ratings = sqlCtx.sql("select r.user, r.item, (r.rating-a.avg_rating) as resids from ratings r, \ - averages a where a.user=r.user") - residual_ratings.registerTempTable("residuals") - - user_sim_resids = sqlCtx.sql("select u.user2, r.user, r.item, r.resids, similarity, r.resids*similarity as r_w from residuals r, \ - user_sim u where r.user=u.user1") - user_sim_resids.registerTempTable("user_sim_resids") - - item_adjusts = sqlCtx.sql("select user2, item, sum(r_w)/sum(abs(similarity)) as item_adj from user_sim_resids group by user2, item") - item_adjusts.registerTempTable("item_adjusts") - - predictions = sqlCtx.sql("select user2 as user, item, (avg_rating +item_adj) as pred_rating \ - from item_adjusts i, averages a where a.user=i.user2") - - return predictions - -def calc_user_user_cf2(training_data): - """ - A very simple user-user CF algorithm in PySpark. Method is more stable than calc_user_user_cf - - Method derived from the Coursera course: Recommender Systems taught by Prof Joseph Konstan (Universitu of Minesota) - and Prof Michael Ekstrand (Texas State University) - - Args: - y_training_data: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] - - Returns: - predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. 
- - """ - - user_groups = training_data.groupBy(lambda (user, item, rating): user) - - user_groups_sim = user_groups.cartesian(user_groups).map(lambda ((user1_id, user1_rows), (user2_id, user2_rows)):\ - (user1_id, user2_id, similarity(user1_rows, user2_rows, 1))) - - user_averages = training_data.map(lambda (user, item, rating): (user, (rating))).groupByKey().\ - map(lambda (user, ratings): (user, np.mean(list(ratings)))) - - user_resids = training_data.map(lambda (user, item, rating): (user, (item, rating))).join(user_averages)\ - .map(lambda (user, ((item, rating), avg_rating)): (user, (item, rating-avg_rating))) - - item_adjustments = user_resids.join(user_groups_sim.map(lambda (u1, u2, sim): (u1, (u2, sim))))\ - .map(lambda (u1, ((item, resid), (u2, sim))): ((u2,item), (resid*sim, sim))).\ - groupByKey().map(lambda ((user, item), sim_list): (user, item, calc_item_adjust(sim_list))) - - predictions = item_adjustments.map(lambda (user, item, item_adj): (user, (item, item_adj))).join(user_averages)\ - .map(lambda (user, ((item, item_adj), (avg_rate))): (user, item, avg_rate+item_adj)) - - return predictions - -def calc_item_adjust(sim_resids): - #data coming into this function is a list of [residual*similarity, similarity] for all user, item paris - #we want to output sum(residual*similarity)/sum(abs(similarity)) - sum_r_w = 0 - sum_sim = 0 - for s in sim_resids: - sum_r_w += s[0] - sum_sim += abs(s[1]) - - if sum_sim ==0: - return 0 - else: - return sum_r_w/sum_sim - -def calc_item_item_cf(training_data): - """ - A very simple item-item CF algorithm in PySpark. - - Method derived from the Coursera course: Recommender Systems taught by Prof Joseph Konstan (Universitu of Minesota) - and Prof Michael Ekstrand (Texas State University) - - Args: - y_training_data: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] - - Returns: - predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. 
- - """ - - item_groups = training_data.groupBy(lambda (user, item, rating): item) - item_similarity = item_groups.cartesian(item_groups).map(lambda ((item1_id, item1_rows), (item2_id, item2_rows)):\ - (item1_id, item2_id, similarity(item1_rows, item2_rows, 0))) - - user_item_sim = training_data.keyBy(lambda (user, item, rating): item)\ - .join(item_similarity.keyBy(lambda (item1, item2, sim): item1))\ - .map(lambda (item_id,((user, item, rating),(item1, item2, sim))):((user, item2), (item,rating,sim)))\ - .filter(lambda ((user, item2), (item,rating,sim)): item2!=item) - - predictions = user_item_sim.groupByKey()\ - .map(lambda ((user, item), rows): (user, item, get_item_prob(rows))) - - return predictions - -def similarity(item1_rows, item2_rows, index): - #to determine user similarity index=0 - #to determine item similarity index=1 - rating_match = [] - for i in item1_rows: - for j in item2_rows: - if i[index]==j[index]: - rating_match.append((i[2],j[2])) - - if len(rating_match)==0: - sim = 0.0 - else: - sim = cosine_similarity(*zip(*rating_match))[0][0] - - return float(sim) - -def get_item_prob(rows): - nom = 0 - denom = 0 - for r in rows: - nom += r[1]*r[2] - denom += abs(r[2]) - - if denom ==0: - return 0 - else: - item_prob = nom/denom - return float(item_prob) \ No newline at end of file From 3bf5cd6b5982c20785a85b202dfc2bda5637a9b1 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Thu, 14 Jan 2016 13:53:23 -0800 Subject: [PATCH 33/39] wip: add uservector and contentvector in recommenders --- .../algorithms/content_based_kmeans.py | 0 .../data_prep/movieLens_vectorize.py | 100 +++++++++++ hermes/algorithms/data_prep/wiki_vectorize.py | 160 ++++++++++++++++++ .../algorithms/recommender_helpers.py | 0 hermes/hermes.py | 9 +- hermes/hermesctl.py | 141 ++++++++++----- hermes/modules/data.py | 3 +- hermes/modules/metricgenerator.py | 9 + hermes/modules/recommendergenerator.py | 20 ++- hermes/modules/rg/default_usecase.py | 17 +- hermes/modules/rg/interface.py | 6 + hermes/modules/rg/second_usecase.py | 14 ++ hermes/modules/vectorgenerator.py | 11 +- 13 files changed, 437 insertions(+), 53 deletions(-) rename {src => hermes}/algorithms/content_based_kmeans.py (100%) create mode 100644 hermes/algorithms/data_prep/movieLens_vectorize.py create mode 100644 hermes/algorithms/data_prep/wiki_vectorize.py rename {src => hermes}/algorithms/recommender_helpers.py (100%) create mode 100644 hermes/modules/rg/second_usecase.py diff --git a/src/algorithms/content_based_kmeans.py b/hermes/algorithms/content_based_kmeans.py similarity index 100% rename from src/algorithms/content_based_kmeans.py rename to hermes/algorithms/content_based_kmeans.py diff --git a/hermes/algorithms/data_prep/movieLens_vectorize.py b/hermes/algorithms/data_prep/movieLens_vectorize.py new file mode 100644 index 0000000..1d038bb --- /dev/null +++ b/hermes/algorithms/data_prep/movieLens_vectorize.py @@ -0,0 +1,100 @@ +import numpy as np + +class movieLens_vectorize(): + + def __init__(self, user_interactions, content, user_vector_type, content_vector_type, **support_files ): + """ + Class initializer to load the required files + + Args: + user_interactions: The raw RDD of the user interactions. For MovieLens, these are the ratings + content: The raw RDD containing the item content. For MovieLens, this is the movie categories + user_vector_type: The type of user vector desired. For MovieLens you can choose between ['ratings', 'pos_ratings', 'ratings_to_interact', 'none']. 
+ If 'none' is used then this means you will run your own custom mapping + content_vector_type: The type of content vector desired. For MovieLens you can choose between ['genre', 'none']. + If none is chosen no content vector will be returned and None may be passed into the content argument. + You do not need a content vector to run pure CF only but some performance metrics will not be able to be ran + support_files: If they exist, the supporting files, dataFrames, and/or file links necessary to run the content vectors. + + + """ + self.user_vector_type = user_vector_type + self.content_vector_type = content_vector_type + + #Filter out uninteresting articles and users if they still exist in the dataset + self.user_interactions =user_interactions + self.user_interactions.registerTempTable("ratings") + self.content = content + self.content.registerTempTable("content") + + #if no support files were passed in, initialize an empty support file + if support_files: + self.support_files = support_files + else: + self.support_files = {} + + + def get_user_vector(self): + + if self.user_vector_type=='ratings': + user_info = self.user_interactions.map(lambda row: (row.user_id, row.movie_id, row.rating) ) + return user_info + + elif self.user_vector_type=='pos_ratings': + user_info = self.user_interactions.map(lambda row: (row.user_id, row.movie_id, row.rating) ).filter(lambda (u,m,r): r>3) + return user_info + + elif self.user_vector_type=='ratings_to_interact': + user_info = self.user_interactions.map(lambda row: (row.user_id, row.movie_id, rating_to_interaction(row.rating)) ) + return user_info + + elif self.user_vector_type=='none': + return None + + else: + print "Please choose a user_vector_type between 'ratings', 'pos_ratings', 'ratings_to_interact', and 'none'" + return None + + def get_content_vector(self): + + if self.content_vector_type=='genre': + content_array = self.content.map(lambda row: (row.movie_id, genre_vectorizer(row))) + return content_array + + elif self.content_vector_type=='none': + return None + + else: + print "Please choose a content_vector_type between 'genre' or 'none'" + return None + + + +def rating_to_interaction(rating): + if rating<3: + return -1 + else: + return 1 + + +def genre_vectorizer(row): + return np.array(( + int(row.genre_action), + int(row.genre_adventure), + int(row.genre_animation), + int(row.genre_childrens), + int(row.genre_comedy), + int(row.genre_crime), + int(row.genre_documentary), + int(row.genre_drama), + int(row.genre_fantasy), + int(row.genre_filmnoir), + int(row.genre_horror), + int(row.genre_musical), + int(row.genre_mystery), + int(row.genre_romance), + int(row.genre_scifi), + int(row.genre_thriller), + int(row.genre_war), + int(row.genre_western), + )) \ No newline at end of file diff --git a/hermes/algorithms/data_prep/wiki_vectorize.py b/hermes/algorithms/data_prep/wiki_vectorize.py new file mode 100644 index 0000000..37d2eb6 --- /dev/null +++ b/hermes/algorithms/data_prep/wiki_vectorize.py @@ -0,0 +1,160 @@ +from src.utils import article_to_category, glove, remove_templates, clean_categories, clean_links +import string +import numpy as np + +class wiki_vectorize(): + + def __init__(self, user_interactions, content, user_vector_type, content_vector_type, **support_files): + """ + Class initializer to load the required files + + Args: + user_interactions: The raw RDD of the user interactions. For Wikipedia, this it is the full edit history. 
+ We have been reading it in as wiki_edits = sqlCtx.read.json(wiki_edit_json_data_path, schema=schema) + content: The raw RDD containing the item content. For Wikipedia, this is the latest edit which contains full article content + user_vector_type: The type of user vector desired. For Wikipedia you can choose between ['num_edits', 'any_interact', 'num_edits_ceil', 'none']. + num_edits_ceil will count the number of edits but set an upper limit of 5 edits + If 'none' is used then this means you will run your own custom mapping + content_vector_type: The type of content vector desired. For Wikipedia you can choose between ['glove', 'category_map', 'none']. + If none is chosen no content vector will be returned and None may be passed into the content argument. + You do not need a content vector to run pure CF only but some performance metrics will not be able to be ran + support_files: If they exist, the supporting files, dataFrames, and/or file links necessary to run the content vectors. + For example the category_map function at least needs the category_list from dbPedia + + """ + self.user_vector_type = user_vector_type + self.content_vector_type = content_vector_type + + #Filter out uninteresting articles and users if they still exist in the dataset + user_interactions.registerTempTable("ratings") + content.registerTempTable("content") + + filtered = sqlCtx.sql("select * from ratings where redirect_target is null and article_namespace=0 and user_id is not null") + filtered_content = sqlCtx.sql("select * from content where redirect_target is null and article_namespace=0 and full_text is not null") + + self.filtered = filtered + self.filtered.registerTempTable("wiki_ratings") + + self.filtered_content = filtered_content + self.filtered_content.registerTempTable("wiki_content") + + #if no support files were passed in, initialize an empty support file + if support_files: + self.support_files = support_files + else: + self.support_files = {} + + + def get_user_vector(self): + + if self.user_vector_type=='num_edits': + user_info = sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings \ + group by user_id, article_id") + + return user_info + + elif self.user_vector_type=='any_interact': + user_info = sqlCtx.sql("select user_id as user, article_id as item, 1 as rating from wiki_ratings \ + group by user_id, article_id") + + return user_info + + elif self.user_vector_type=='num_edits_ceil': + user_info = sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki \ + group by user_id, article_id")\ + .map(lambda (user, article, rating): (user, article, max(rating, 5))) + + return user_info + + elif self.user_vector_type=='none': + return None + + else: + print "Please choose a user_vector_type between num_edits, any_interact, num_edits_ceil or none" + return None + + + def get_content_vector(self): + if self.content_vector_type=='glove': + + if self.support_files==1: + glove_model = self.support_files["glove_model"] + + article_mapping = self.filtered_content\ + .map(lambda row: (row.article_id, remove_templates(row.full_text)))\ + .map(lambda tup: (tup[0],clean_categories(tup[1])))\ + .map(lambda tup: (tup[0],clean_links(tup[1])))\ + .map( + lambda tup: + (tup[0], tup[1]\ + .replace('\n', ' ')\ + .replace("", '')\ + .replace("", '')\ + ) + )\ + .map(lambda tup: (tup[0], remove_punctuation(tup[1])))\ + .map(lambda tup: (tup[0], remove_urls(tup[1])))\ + .map(lambda tup: (tup[0], article_to_glove(tup[1], glove_model))) + + return 
article_mapping + + else: + print "Please pass in a glove_model. Like: support_files['glove_model']=Glove('glove.6B.50d.txt')" + elif self.content_vector_type=='category_map': + + if len(self.support_files)==3: + #The category map supporting dataFrames and objects are as followed: + #high_level_idx: An array of the high level categories to map to e.g. ['Concepts', 'Life', 'Physical_universe', 'Society'] + #category_index_graph_link: Path to the csv of the category links as created from wiki_categories.create_linked_list() + #category_idx: Dictionary of the categories to an index as created from wiki_categories.create_category_idx_dicts() + + high_level_categories = self.support_files['high_level_categories'] + category_index_graph_link = self.support_files['category_index_graph_link'] + category_idx = self.support_file['category_idx'] + + ac = article_to_category(high_level_categories, category_index_graph_link, category_idx) + article_mapping = ac.run_mapping(self.filtered_content) + + return article_mapping + + else: + #print "To run category map you must at least have the category_list from dbPedia" + ##TODO work on the article_to_category function so that it can just pull in the category list from dpPedia + print "Please pass in the following files:" + print "high_level_idx: An array of the high level categories to map to e.g. ['Concepts', 'Life', 'Physical_universe', 'Society']" + print 'category_index_graph_link: Path to the csv of the category links as created from wiki_categories.create_linked_list()' + print 'category_idx: Dictionary of the categories to an index as created from wiki_categories.create_category_idx_dicts()' + print 'support_files = {"high_level_categories" : high_level_categories, \ + "category_index_graph_link" : category_index_graph_link, \ + "category_idx" : category_idx}' + return None + + elif self.content_vector_type=='none': + return None + + else: + print "Please choose between glove, category_map or none" + return None + +def remove_punctuation(text): + for char in string.punctuation: + text = text.replace(char, '') + return text + +def article_to_glove(text, model): + vec = np.zeros(model.vector_size) + for word in text.split(): + vec += model[word.lower()] + + return vec + +def remove_urls(text): + stext = text.split() + next_text = [] + for word in stext: + if word.startswith('http'): + continue + else: + next_text.append(word) + + return ' '.join(next_text) \ No newline at end of file diff --git a/src/algorithms/recommender_helpers.py b/hermes/algorithms/recommender_helpers.py similarity index 100% rename from src/algorithms/recommender_helpers.py rename to hermes/algorithms/recommender_helpers.py diff --git a/hermes/hermes.py b/hermes/hermes.py index c2872b9..cd8c40a 100644 --- a/hermes/hermes.py +++ b/hermes/hermes.py @@ -111,18 +111,23 @@ def __make_prediction(cargo): # select which recommenders based on the vector type recommenders = None + thisvector_uservector = None + thisvector_contentvector = None if helper.is_direct_subclass(thisvector, vg.UserVector): if Globals.verbose: Globals.logger.debug("Iterating through recommenders for user vector on data %s", thisvector.data.datapath) + thisvector_uservector = thisvector recommenders = cargo.user_recommenders elif helper.is_direct_subclass(thisvector, vg.ContentVector): if Globals.verbose: Globals.logger.debug("Iterating through recommenders for content vector on data %s", thisvector.data.datapath) + thisvector_contentvector = thisvector + thisvector_uservector = thisvector.uservector recommenders = 
cargo.content_recommenders # run all recommenders on the vector for r in recommenders: if Globals.verbose: Globals.logger.debug("Making recommendation %s on data %s", r, thisvector.data.datapath) # TODO: implement other use case, ie. WithTfidf(), etc. - recommender = rg.RecommenderFactory().create_obj_recommender(r, thisvector) + recommender = rg.RecommenderFactory().create_obj_recommender(r, thisvector_uservector, thisvector_contentvector) # default use case # recommender = RecommenderFactory().create_obj_recommender(r, vector, Default()) # with tf-idf use case @@ -158,8 +163,10 @@ def __calculate_metrics(cargo): Globals.logger.info("-" * 80) Globals.logger.info("Data: %s" % cargo.vectors[i].data.datapath) for m in cargo.metrics: + Globals.logger.info("Metric: %s" % (m)) # check if metric exists metric = mg.MetricFactory().create_obj_metric(m) + Globals.logger.info(metric) # set metric in executor executor.change_metric(metric) # execute the metric diff --git a/hermes/hermesctl.py b/hermes/hermesctl.py index 50bbc3e..b4e1127 100644 --- a/hermes/hermesctl.py +++ b/hermes/hermesctl.py @@ -145,23 +145,31 @@ def handle_dataset_section(dataset_items, config_path): dataname = datasets_items["dataname"] lofmap = config.map_section(lofcp, dataname) - # create UserVectorData or ContentVectorData or both hasUserVector = False # check it has the required items to build a UserVectorData if set(config.REQ_UV_HEADINGS) < set(datasets_items.keys()): hasUserVector = True - create_datas(lofmap, dataname, datasets_items, config_path, isUserVector=True) hasContentVector = False # check it has the required items to build a ContentVectorData if set(config.REQ_CV_HEADINGS) < set(datasets_items.keys()): hasContentVector = True - create_datas(lofmap, dataname, datasets_items, config_path, isUserVector=False) - if not hasUserVector and not hasContentVector: + if not hasContentVector and not hasUserVector: Globals.logger.error("ERROR: config " + config_path + " does not have declaration for a user vector or a content vector") sys.exit() + if hasContentVector and not hasUserVector: + Globals.logger.error("ERROR: config " + config_path + " does not have declaration for a user vector when a content vector is declared") + sys.exit() + + if hasContentVector and hasUserVector: + # create content vector data + create_datas(lofmap, dataname, datasets_items, config_path, isUserVector=False) + else: + # create user vector data + create_datas(lofmap, dataname, datasets_items, config_path, isUserVector=True) + def create_datas(lofmap, dataname, datasets_items, config_path, isUserVector): """ Helper function that creates a UserVectorData or ContentVectorData depending if it isUserVector or not. @@ -170,48 +178,105 @@ def create_datas(lofmap, dataname, datasets_items, config_path, isUserVector): cargo's data list. 
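    # As a point of reference, a hypothetical dataset section that this function can parse might look like
    # the following (the heading names come from the code below; the file keys and transformation values are
    # only illustrative):
    #
    #   [movielens]
    #   user_vector_data = ["movielens_ratings"]
    #   user_vector_transformations = ["ratings"]
    #   content_vector_data = ["movielens_movies"]
    #   content_vector_transformations = ["genre"]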
""" + # TODO: rewrite this, quick fix for now if isUserVector: datapaths_heading = "user_vector_data" vector_transformations_heading = "user_vector_transformations" schemapaths_heading = "user_vector_schemas" - else: - datapaths_heading = "content_vector_data" - vector_transformations_heading = "content_vector_transformations" - schemapaths_heading = "content_vector_schemas" - - datapaths = json.loads(datasets_items[datapaths_heading]) - vector_transformations = json.loads(datasets_items[vector_transformations_heading]) - hasSchemas = False - if schemapaths_heading in datasets_items.keys(): - schemapaths = json.loads(datasets_items[schemapaths_heading]) - hasSchemas = True - - # check that a vector transformation is specified for each data - # TODO: multiple vector trasnformation for each data in the future? - if len(datapaths) != len(vector_transformations): - Globals.logger.error("ERROR: must specify a vector type for each data in config " + config_path) - sys.exit() - for i in range(0, len(datapaths)): - # set datapath - try: - datapath = lofmap[datapaths[i]] - except KeyError: - Globals.logger.error("ERROR: cannot find data " + datapath + " in the list_of_files_config for config " + config_path) + datapaths = json.loads(datasets_items[datapaths_heading]) + vector_transformations = json.loads(datasets_items[vector_transformations_heading]) + hasSchemas = False + if schemapaths_heading in datasets_items.keys(): + schemapaths = json.loads(datasets_items[schemapaths_heading]) + hasSchemas = True + + # check that a vector transformation is specified for each data + # TODO: multiple vector trasnformation for each data in the future? + if len(datapaths) != len(vector_transformations): + Globals.logger.error("ERROR: must specify a vector type for each data in config " + config_path) sys.exit() - # set vector_transformation - vector_transformation = vector_transformations[i] - # set schemapath - try: - if hasSchemas: schemapath = lofmap[schemapaths[i]] - except IndexError, KeyError: - schemapath = None - - if isUserVector: + + for i in range(0, len(datapaths)): + # set datapath + try: + datapath = lofmap[datapaths[i]] + except KeyError: + Globals.logger.error("ERROR: cannot find data " + datapath + " in the list_of_files_config for config " + config_path) + sys.exit() + # set vector_transformation + vector_transformation = vector_transformations[i] + # set schemapath + try: + if hasSchemas: schemapath = lofmap[schemapaths[i]] + except IndexError, KeyError: + schemapath = None + uservectordata = UserVectorData(datapath, vector_transformation, schemapath, dataname) cargo.datas.append(uservectordata) - else: - contentvectordata = ContentVectorData(datapath, vector_transformation, schemapath, dataname) + + else: + # user vector + uv_datapaths_heading = "user_vector_data" + uv_vector_transformations_heading = "user_vector_transformations" + uv_schemapaths_heading = "user_vector_schemas" + + uv_datapaths = json.loads(datasets_items[uv_datapaths_heading]) + uv_vector_transformations = json.loads(datasets_items[uv_vector_transformations_heading]) + uv_hasSchemas = False + if uv_schemapaths_heading in datasets_items.keys(): + uv_schemapaths = json.loads(datasets_items[uv_schemapaths_heading]) + uv_hasSchemas = True + + # content vector + cv_datapaths_heading = "content_vector_data" + cv_vector_transformations_heading = "content_vector_transformations" + cv_schemapaths_heading = "content_vector_schemas" + + cv_datapaths = json.loads(datasets_items[cv_datapaths_heading]) + cv_vector_transformations = 
json.loads(datasets_items[cv_vector_transformations_heading]) + cv_hasSchemas = False + if cv_schemapaths_heading in datasets_items.keys(): + cv_schemapaths = json.loads(datasets_items[cv_schemapaths_heading]) + cv_hasSchemas = True + + # check that a vector transformation is specified for each data + # TODO: multiple vector trasnformation for each data in the future? + if len(cv_datapaths) != len(cv_vector_transformations) or len(uv_datapaths) != len(uv_vector_transformations): + Globals.logger.error("ERROR: must specify a vector type for each data in config " + config_path) + sys.exit() + + if len(cv_datapaths) != len(uv_datapaths): + Globals.logger.error("ERROR: content vector must have a corresponding user vector") + sys.exit() + + for i in range(0, len(cv_datapaths)): + # set datapath + try: + cv_datapath = lofmap[cv_datapaths[i]] + except KeyError: + Globals.logger.error("ERROR: cannot find data " + cv_datapath + " in the list_of_files_config for config " + config_path) + sys.exit() + try: + uv_datapath = lofmap[uv_datapaths[i]] + except KeyError: + Globals.logger.error("ERROR: cannot find data " + uv_datapath + " in the list_of_files_config for config " + config_path) + sys.exit() + # set vector_transformation + cv_vector_transformation = cv_vector_transformations[i] + uv_vector_transformation = uv_vector_transformations[i] + # set schemapath + try: + if cv_hasSchemas: cv_schemapath = lofmap[cv_schemapaths[i]] + except IndexError, KeyError: + cv_schemapath = None + try: + if uv_hasSchemas: uv_schemapath = lofmap[uv_schemapaths[i]] + except IndexError, KeyError: + uv_schemapath = None + + uservectordata = UserVectorData(uv_datapath, uv_vector_transformation, uv_schemapath, dataname) + contentvectordata = ContentVectorData(cv_datapath, cv_vector_transformation, cv_schemapath, dataname, uservectordata) cargo.datas.append(contentvectordata) # extract configs diff --git a/hermes/modules/data.py b/hermes/modules/data.py index c0e345c..092e452 100644 --- a/hermes/modules/data.py +++ b/hermes/modules/data.py @@ -32,9 +32,10 @@ def __init__(self, datapath, vector_transformation, schemapath, dataname): self.which_vector = Globals.constants.USERVECTOR class ContentVectorData(Data): - def __init__(self, datapath, vector_transformation, schemapath, dataname): + def __init__(self, datapath, vector_transformation, schemapath, dataname, uservectordata): super(self.__class__, self).__init__(datapath, vector_transformation, schemapath, dataname) self.which_vector = Globals.constants.CONTENTVECTOR + self.uservectordata = uservectordata diff --git a/hermes/modules/metricgenerator.py b/hermes/modules/metricgenerator.py index 2ff0b7a..90f08bc 100644 --- a/hermes/modules/metricgenerator.py +++ b/hermes/modules/metricgenerator.py @@ -25,6 +25,7 @@ def execute(self, vector): return self.metric.calculate_metric(vector) def change_metric(self, new_metric): + print "changing metric to %s" % new_metric self.metric = new_metric # ================================================================================ @@ -34,10 +35,12 @@ def change_metric(self, new_metric): class MetricFactory(object): def create_obj_metric(self, metric_str): which_metric = getattr(sys.modules[__name__], metric_str) + print "which_metric: ", which_metric if not which_metric: # cannot find class raise ValueError else: + print "calling on which_metric()" return which_metric() class Metric: @@ -46,10 +49,16 @@ def calculate_metric(self, vector=None) : class RMSE(Metric): def calculate_metric(self, vector): + print "executing RMSE" + print 
vector.test_vector.take(5) + print vector.prediction_vector.take(5) return pm.calculate_rmse(vector.test_vector, vector.prediction_vector) class MAE(Metric): def calculate_metric(self, vector): + print "executing MAE" + print vector.test_vector.take(5) + print vector.prediction_vector.take(5) return pm.calculate_mae(vector.test_vector, vector.prediction_vector) class PRFS(Metric): diff --git a/hermes/modules/recommendergenerator.py b/hermes/modules/recommendergenerator.py index fed8697..d441217 100644 --- a/hermes/modules/recommendergenerator.py +++ b/hermes/modules/recommendergenerator.py @@ -30,8 +30,9 @@ # ================================================================================ class Recommender(object): - def __init__(self, vector, implementation=Default()): - self.vector = vector + def __init__(self, user_vector, content_vector=None, implementation=Default()): + self.user_vector = user_vector + self.content_vector = content_vector self.implementation = implementation def make_prediction(self): @@ -43,13 +44,13 @@ def make_prediction(self): # ================================================================================ class RecommenderFactory(object): - def create_obj_recommender(self, recommender_str, vector, implementation=Default()): + def create_obj_recommender(self, recommender_str, user_vector, content_vector=None, implementation=Default()): which_recommender = getattr(sys.modules[__name__], recommender_str) if not which_recommender: # cannot find class raise ValueError else: - return which_recommender(vector, implementation) + return which_recommender(user_vector, content_vector, implementation) # ================================================================================ @@ -58,9 +59,16 @@ def create_obj_recommender(self, recommender_str, vector, implementation=Default class ALS(Recommender): def make_prediction(self): - return self.implementation.make_prediction_with_als(self.vector) + return self.implementation.make_prediction_with_als(self.user_vector, self.content_vector) class CBWithKMeans(Recommender): def make_prediction(self): - return self.implementation.make_prediction_with_cbkmeans(self.vector) + return self.implementation.make_prediction_with_cbkmeans(self.user_vector, self.content_vector) +class UserUser(Recommender): + def make_prediction(self): + return self.implementation.make_prediction_with_useruser(self.user_vector, self.content_vector) + +class ItemItem(Recommender): + def make_prediction(self): + return self.implementation.make_prediction_with_itemitem(self.user_vector, self.content_vector) \ No newline at end of file diff --git a/hermes/modules/rg/default_usecase.py b/hermes/modules/rg/default_usecase.py index 0d0c88d..d5e48d8 100644 --- a/hermes/modules/rg/default_usecase.py +++ b/hermes/modules/rg/default_usecase.py @@ -7,13 +7,22 @@ # ================================================================================ class Default(ImplementationInterface): - def make_prediction_with_als(self, vector): - # TODO: specify rank based on what the user wants + def make_prediction_with_als(self, user_vector, content_vector): + user_vector.prediction_vector = cf.calc_cf_mllib(user_vector.training_vector) + return user_vector.prediction_vector + """ + # TODO: specify rank based on what the user wants import pyspark.mllib.recommendation as mllib model = mllib.ALS.train(vector.training_vector, rank=3) prediction_vector = model.predictAll( vector.test_vector.map( lambda x: (x[0], x[1]) ) ).cache() return prediction_vector """ - 
vector.prediction_vector = cf.calc_cf_mllib(vector.training_vector) - return vector.prediction_vector \ No newline at end of file + + def make_prediction_with_useruser(self, user_vector, content_vector): + user_vector.prediction_vector = cf.calc_user_user_cf2(user_vector.training_vector) + return user_vector.prediction_vector + + def make_prediction_with_itemitem(self, user_vector, content_vector): + user_vector.prediction_vector = cf.calc_item_item_cf(user_vector.training_vector) + return user_vector.prediction_vector diff --git a/hermes/modules/rg/interface.py b/hermes/modules/rg/interface.py index 518399d..841e7f9 100644 --- a/hermes/modules/rg/interface.py +++ b/hermes/modules/rg/interface.py @@ -7,4 +7,10 @@ def make_prediction_with_als(self): raise NotImplemented def make_prediction_with_cbkmeans(self): + raise NotImplemented + + def make_prediction_with_useruser(self): + raise NotImplemented + + def make_prediction_with_itemitem(self): raise NotImplemented \ No newline at end of file diff --git a/hermes/modules/rg/second_usecase.py b/hermes/modules/rg/second_usecase.py new file mode 100644 index 0000000..99d76de --- /dev/null +++ b/hermes/modules/rg/second_usecase.py @@ -0,0 +1,14 @@ +from interface import ImplementationInterface + +import hermes.algorithms.cf as cf + +# ================================================================================ +# Concrete background implementations for default use cases +# ================================================================================ + +class SecondUseCase(ImplementationInterface): + + def make_prediction_with_useruser(self, user_vector, content_vector): + user_vector.prediction_vector = cf.calc_user_user_cf(user_vector.training_vector) + return user_vector.prediction_vector + diff --git a/hermes/modules/vectorgenerator.py b/hermes/modules/vectorgenerator.py index cd5ab9a..8bf1450 100644 --- a/hermes/modules/vectorgenerator.py +++ b/hermes/modules/vectorgenerator.py @@ -95,9 +95,14 @@ def split_data(self, weights, seed): self.validation_vector = validation_vector class ContentVector(Vector): - def __init__(self, data, support_files, user_vector): - super(ContentVector, self)._init__(data, support_files) - self.user_vector = user_vector + def __init__(self, data, support_files, uservector=None, runs_from_notebook=False): + super(ContentVector, self).__init__(data, support_files) + # TODO: terrible, quick fix -> fix it for real in the future + if uservector is not None: + self.uservector = uservector + else: + self.uservector = VectorFactory().create_obj_vector(self.data.uservectordata, support_files, runs_from_notebook) + # ================================================================================ From 72a2a29166682f06528678e709dd0a55e93e8cd9 Mon Sep 17 00:00:00 2001 From: Tiffany J Date: Mon, 25 Jan 2016 11:34:48 -0800 Subject: [PATCH 34/39] wip: integrate framework into an iPython notebook including instructions on how to seupt an iPython notebook on your computer --- docs/using_notebook.md | 65 ++ notebooks/framework_in_a_notebook.ipynb | 1098 +++++++++++++++++++++++ 2 files changed, 1163 insertions(+) create mode 100644 docs/using_notebook.md create mode 100644 notebooks/framework_in_a_notebook.ipynb diff --git a/docs/using_notebook.md b/docs/using_notebook.md new file mode 100644 index 0000000..bf156d4 --- /dev/null +++ b/docs/using_notebook.md @@ -0,0 +1,65 @@ +# How to use iPython Notebook + +1. Install Anaconda +2. Launch Anaconda luncher +3. Launch ipython-notebook +4. 
Create an iPython profile for use with PySpark + ```bash + ipython profile create pyspark + ``` +5. Create a iPython notebook configuration + ```bash + vim ~/.ipython/profile_pyspark/ipython_notebook_config.py + ``` + ```bash + c = get_config() + c.NotebookApp.ip = '*' + c.NotebookApp.open_browser = False + c.NotebookApp.port = 8880 + ``` +6. Create PySpark Setup configuration + ```bash + vim ~/.ipython/profile_pyspark/startup/00-pyspark-setup.py + ``` + ```bash + import os + import sys + import findspark + + # setup spark home + findspark.init() + spark_home = findspark.find() + + # add spark's home directory to path + sys.path.insert(0, os.path.join(spark_home, "python")) + + # add py4j to path + sys.path.insert(0, os.path.join(spark_home, "python/lib/py4j-0.8.2.1-src.zip")) + + # initialize pyspark to predefine the SparkContext variable "sc" + execfile(os.path.join(spark_home, "python/pyspark/shell.py")) + ``` + +7. Run iPython notebook in your desired directory + ```bash + ipython notebook --profile=pyspark + ``` + +8. Test to see if sc is defined. If not, setup the SparkContext and SQLContext by doing the following in your iPython notebook + ```bash + from pyspark import SparkContext + from pyspark.sql import SQLContext + + # setup SparkContext + sc = SparkContext._active_spark_context + + # setup SQLContext + sqlCtx = SQLContext(sc) + ``` + +9. When you are reading your JSON, you need to determine your fs.default.name or fs.defaultFS. You can figure this out by checking out the core-site.xml file. This can be found in Mac OS at /usr/local/Cellar/hadoop//libexec/etc/hadoop/core-site.xml. To read JSON using SQLContext, you have to add this ip address when calling the function. + +For example: your fs.default.name or fs.defaultFS is hdfs://localhost:9000. To use one of the JSON files that you have put into the datasets directory in HDFS, you have to call as follows: + ```bash + dataframe = sqlCtx.read.json("hdfs://localhost:9000/datasets/movielens_1m_movies.json.gz") + ``` diff --git a/notebooks/framework_in_a_notebook.ipynb b/notebooks/framework_in_a_notebook.ipynb new file mode 100644 index 0000000..276ae75 --- /dev/null +++ b/notebooks/framework_in_a_notebook.ipynb @@ -0,0 +1,1098 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Framework" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "debug = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Grabbing the \"framework\" branch from GitHub and use the \"hermes\" folder as a library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Step 1: Install necessary libraries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import importlib\n", + "import pip\n", + "\n", + "def _install(package):\n", + " pip.main(['install', package])\n", + "\n", + "def _import(package):\n", + " importlib.import_module(package)\n", + " \n", + "def install_and_import(package):\n", + " try:\n", + " _import(package)\n", + " except ImportError:\n", + " _install(package)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting GitPython\n", + " Downloading GitPython-1.0.1.tar.gz (355kB)\n", + "Collecting gitdb>=0.6.4 (from GitPython)\n", + " Downloading gitdb-0.6.4.tar.gz (400kB)\n", + "Collecting smmap>=0.8.5 (from gitdb>=0.6.4->GitPython)\n", + " Downloading smmap-0.9.0.tar.gz\n", + "Building wheels for collected packages: GitPython, gitdb, smmap\n", + " Running setup.py bdist_wheel for GitPython\n", + " Stored in directory: /Users/tiffanyj/Library/Caches/pip/wheels/23/f4/31/1d0570ae6ecccca26eafb087788483f614cd740281fd842660\n", + " Running setup.py bdist_wheel for gitdb\n", + " Stored in directory: /Users/tiffanyj/Library/Caches/pip/wheels/63/1b/54/87cf226ccefad0e5fdc78e3c8c65180ac77ed2a04d1dec3a56\n", + " Running setup.py bdist_wheel for smmap\n", + " Stored in directory: /Users/tiffanyj/Library/Caches/pip/wheels/47/75/63/333cdcb6d3e6e8eb1ec6869564b84f7f1e6a875d87541a0ae9\n", + "Successfully built GitPython gitdb smmap\n", + "Installing collected packages: smmap, gitdb, GitPython\n", + "Successfully installed GitPython-1.0.1 gitdb-0.6.4 smmap-0.9.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using pip version 7.1.2, however version 8.0.2 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting click\n", + " Downloading click-6.2-py2.py3-none-any.whl (70kB)\n", + "Installing collected packages: click\n", + "Successfully installed click-6.2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using pip version 7.1.2, however version 8.0.2 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\n" + ] + } + ], + "source": [ + "install_and_import(\"GitPython\")\n", + "install_and_import(\"click\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Step 2: Create a temporary directory.\n", + "\n", + "Step 3: Git clone the \"framework\" branch from GitHub to the temporary directory.\n", + "\n", + "Step 4: Zip the hermes source files.\n", + "\n", + "Step 5: Add zip to SparkContext.\n", + "\n", + "Step 6: Remove temporary directory once it is no longer needed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "remote_url = \"https://github.com/tiffanyj41/hermes.git\"\n", + "remote_branch = \"framework\"\n", + "source_dir = \"hermes\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# helper functions\n", + "import os\n", + "import functools\n", + "\n", + "def _list_all_in_dir(dir_path):\n", + " for path, subdirs, files in os.walk(dir_path):\n", + " for filename in files:\n", + " print os.path.join(path, filename)\n", + " \n", + "def _zip_dir(srcdir_path, zipfile_handler):\n", + " try:\n", + " zipfile_handler.writepy(srcdir_path)\n", + " finally:\n", + " zipfile_handler.close()\n", + " \n", + "def trackcalls(func):\n", + " @functools.wraps(func)\n", + " def wrapper(*args, **kwargs):\n", + " wrapper.has_been_called = True\n", + " return func(*args, **kwargs)\n", + " wrapper.has_been_called = False\n", + " return wrapper\n", + "\n", + "@trackcalls\n", + "def _add_zipfile_to_sc(zipfile_path):\n", + " sc.addPyFile(zipfile_path) " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'sc' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mzipfile\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[1;32mprint\u001b[0m \u001b[0msc\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;31m# create a temporary directory\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mNameError\u001b[0m: name 'sc' is not defined" + ] + } + ], + "source": [ + "import git\n", + "import os\n", + "import tempfile\n", + "import shutil\n", + "import zipfile \n", + "\n", + "# create a temporary directory\n", + "tmpdir_path = tempfile.mkdtemp()\n", + "if debug: print \"temporary directory: %s\\n\" % tmpdir_path\n", + "\n", + "# ensure file is read/write by creator only\n", + "saved_umask = os.umask(0077)\n", + "\n", + "# create a zipfile handler to zip the necessary files\n", + "ziptmpdir_path = tempfile.mkdtemp()\n", + "if debug: print \"temporary directory for zip file: %s\\n\" % ziptmpdir_path\n", + "zipfile_path = ziptmpdir_path + \"/hermes_src.zip\"\n", + "if debug: print \"zip file's path: %s\\n\" % zipfile_path\n", + "zipfile_handler = zipfile.PyZipFile(zipfile_path, \"w\")\n", + "\n", + "# make zipfile handler verbose for debugging\n", + "zipfile_handler.debug = 3\n", + "\n", + "try:\n", + " # clone \"framework\" branch from GitHub into temporary directory\n", + " local_branch = git.Repo.clone_from(remote_url, tmpdir_path, branch=remote_branch)\n", + " if debug: print \"current branch: %s\\n\" % local_branch.head.ref\n", + " if debug: print \"list all in %s:\" % tmpdir_path; _list_all_in_dir(tmpdir_path); print \"\\n\"\n", + " \n", + " # zip \"hermes\" directory\n", + " if debug: print \"zipping: %s\\n\" % os.path.join(tmpdir_path, source_dir)\n", + " _zip_dir(os.path.join(tmpdir_path, source_dir), 
zipfile_handler)\n", + " \n", + " # check zip file\n", + " if debug: print \"Is zip file %s valid? %s\\n\" % (zipfile_path, zipfile.is_zipfile(zipfile_path))\n", + " \n", + " # add zip to SparkContext \n", + " # note: you can only add zip to SparkContext one time\n", + " if not _add_zipfile_to_sc.has_been_called:\n", + " if debug: print \"add zip file %s into spark context\\n\" % zipfile_path\n", + " _add_zipfile_to_sc(zipfile_path)\n", + " else:\n", + " if debug: print \"zip file %s is already added into spark context; will not re-add\\n\" % zipfile_path\n", + " \n", + "except IOError as e:\n", + " raise e\n", + "else:\n", + " os.remove(zipfile_path)\n", + "finally:\n", + " os.umask(saved_umask)\n", + " shutil.rmtree(tmpdir_path)\n", + " shutil.rmtree(ziptmpdir_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 1\n", + "* Run movielens_10m_ratings with **ratings** vector transformation\n", + "* Implement **ALS** recommender system algorithms\n", + "* Implement **RMSE, MAE** metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Framework is based on a state machine. Since you are using a notebook, it is unlikely that you will use a state machine to automate the process, but you can use parts of the state machine to do what you need to do." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 1: __start()\n", + "**For those who use [MovieLens 1M CF test src code](http://l41-srv-mcdh32.b.internal:8880/notebooks/Hermes/MovieLens%201M%20CF%20test%20src%20code.ipynb#) as guidance, this is executing the pre-requisites when the HDFS directory and the input data are not defined yet.**\n", + "\n", + "Function: \n", + "* __start() creates the HDFS directory and uploads the input data. 
\n", + "* __start() implements the start_state of the state machine.\n", + "\n", + "```bash\n", + "\n", + "def __start(cargo):\n", + " \"\"\"start_state without the state machine.\"\"\"\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"In start_state:\")\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"Creating the hdfs directory \" + cargo.hdfs_dir)\n", + " os.system(\"hdfs dfs -mkdir \" + cargo.hdfs_dir)\n", + "\n", + " def load_json_files(datas):\n", + " for i in range(0, len(datas)):\n", + " json_path = datas[i].datapath\n", + " if Globals.verbose: Globals.logger.debug(\"Loading JSON file \" + json_path + \" into hdfs directory \" + cargo.hdfs_dir)\n", + " os.system(\"hdfs dfs -put \" + json_path + \" \" + cargo.hdfs_dir + \"/\" + os.path.basename(json_path))\n", + "\n", + " load_json_files(cargo.datas)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/tiffanyj/datasets/movielens/movielens_1m_movies.json.gz\n", + "/datasets/movielens/1m/movielens_1m_movies.json.gz\n", + "/home/tiffanyj/datasets/movielens/movielens_1m_ratings.json.gz\n", + "/datasets/movielens/1m/movielens_1m_ratings.json.gz\n" + ] + } + ], + "source": [ + "import os\n", + "hdfs_dir = \"/datasets/movielens/1m\"\n", + "movies_json_path = \"/home/tiffanyj/datasets/movielens/movielens_1m_movies.json.gz\"\n", + "movies_json_path_in_hdfs = hdfs_dir + \"/\" + os.path.basename(movies_json_path)\n", + "ratings_json_path = \"/home/tiffanyj/datasets/movielens/movielens_1m_ratings.json.gz\"\n", + "ratings_json_path_in_hdfs = hdfs_dir + \"/\" + os.path.basename(ratings_json_path)\n", + "\n", + "print movies_json_path\n", + "print movies_json_path_in_hdfs\n", + "print ratings_json_path \n", + "print ratings_json_path_in_hdfs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 1: You implement what is already in __start() manually yourself" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "256" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "# create hdfs_dir \n", + "os.system(\"hdfs dfs -mkdir \" + hdfs_dir)\n", + "# put json located at json_path into hdfs_dir\n", + "os.system(\"hdfs dfs -put \" + ratings_json_path + \" \" + ratings_json_path_in_hdfs)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "ImportError", + "evalue": "No module named hermes", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mhermes\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mmodules\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;31m# define Data (ie. 
UserVectorData) which is a class wrapper of the json\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;31m# and will be used to create a Vector (ie. UserVector)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mImportError\u001b[0m: No module named hermes" + ] + } + ], + "source": [ + "from hermes import *\n", + "import modules.data\n", + "\n", + "# define Data (ie. UserVectorData) which is a class wrapper of the json \n", + "# and will be used to create a Vector (ie. UserVector)\n", + "datapath = ratings_json_path\n", + "vector_transformation = \"ratings\"\n", + "schemapath = None\n", + "dataname = \"movielens\"\n", + "\n", + "uservectordata = modules.data.UserVectorData(datapath, vector_transformation, schemapath, dataname)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 2: You execute using the __start() function" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# import hermes where __start() function is defined\n", + "from hermes import *\n", + "# import cargo where Cargo class is defined\n", + "import modules.cargo\n", + "# import data where configuration is defined\n", + "import modules.data\n", + "\n", + "# create cargo\n", + "cargo = modules.cargo.Cargo()\n", + "\n", + "# add items to cargo\n", + "cargo.hdfs_dir = hdfs_dir\n", + "\n", + "# define Data and put it in cargo\n", + "dataname = \"movielens\"\n", + "datapath = ratings_json_path\n", + "vector_transformation = \"ratings\"\n", + "schemapath = None\n", + "uservectordata = modules.data.UserVectorData(datapath, vector_transformation, schemapath, dataname)\n", + "cargo.datas.append(uservectordata)\n", + "\n", + "# call the start function\n", + "hermes.__start(cargo)\n", + "\n", + "uservectordata = cargo.datas[0]\n", + "uservectordata.cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: __json_to_rdd()\n", + "**For those who use [MovieLens 1M CF test src code](http://l41-srv-mcdh32.b.internal:8880/notebooks/Hermes/MovieLens%201M%20CF%20test%20src%20code.ipynb#) as guidance, this is accomplishing cell # 5, 6, 7.**\n", + "\n", + "Function: \n", + "* __json_to_rdd() parses JSON to RDD. 
\n", + "* __json_to_rdd() implements the json_to_rdd state of the state machine.\n", + "\n", + "```bash\n", + " \"\"\"json_to_rdd_state without the state macine.\"\"\"\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"In json_to_rdd_state:\")\n", + "\n", + " # create RDD for each JSON file and store it in Cargo's vectors list\n", + " for i in range(0, len(cargo.datas)):\n", + " data = cargo.datas[i]\n", + " if Globals.verbose: Globals.logger.debug(\"Working with json file %s\" % data.datapath)\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"Creating dataframe based on the content of the json file\")\n", + " datapath_in_hdfs = \"hdfs://\" + cargo.fs_default_ip_addr + \"/\" + cargo.hdfs_dir + \"/\" + os.path.basename(data.datapath)\n", + " data.set_dataframe(Globals.scsingleton.sc, Globals.scsingleton.sqlCtx, datapath_in_hdfs)\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"Creating RDD based on the computed dataframe and configuration provided by the user\")\n", + " cargo.vectors.append( vg.VectorFactory().create_obj_vector(data, cargo.support_files) ) \n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 1: You implement what is already in __json_to_rdd() manually yourself\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "import modules.data\n", + "import modules.vectorgenerator\n", + "\n", + "# convert JSON to Dataframe\n", + "uservectordata.set_dataframe(sc, sqlCtx, ratings_json_path_in_hdfs) \n", + "ratings = uservectordata.dataframe # extracting dataframe variable from UserVectorData class\n", + "\n", + "# this is the same thing as \n", + "# ratings = sqlCtx.read.json(\"hdfs://\" + ratings_json_path_in_hdfs)\n", + "# ratings.repartition(sc.defaultParallelism * 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "import modules.vectorgenerator\n", + "import modules.vg\n", + "\n", + "# support_files is a dictionary that you can pass in during vector creation \n", + "support_files = {}\n", + "\n", + "# convert DataFrame to RDD\n", + "mv = modules.vectorgenerator.VectorFactory().create_obj_vector(uservectordata, None, True) \n", + "all_user_ratings = mv.vector\n", + "\n", + "# this is the same thing as \n", + "# mv = movieLens_vectorize.movieLens_vectorize(ratings, None, \"ratings\", \"none\")\n", + "# all_user_ratings = mv.get_user_vector()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print type(all_user_ratings)\n", + "all_user_ratings.take(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 2: You execute using the __json_to_rdd() function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "\n", + "cargo.fs_default_ip_addr = \"\"\n", + "cargo.hdfs_dir = hdfs_dir[1:]\n", + "cargo.support_files = {}\n", + "\n", + "# call json_to_rdd function\n", + "hermes.__json_to_rdd(cargo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "mv = cargo.vectors[0]\n", + "all_user_ratings = mv.vector\n", + "print type(all_user_ratings)\n", + 
"all_user_ratings.take(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: __split_data()\n", + "**For those who use [MovieLens 1M CF test src code](http://l41-srv-mcdh32.b.internal:8880/notebooks/Hermes/MovieLens%201M%20CF%20test%20src%20code.ipynb#) as guidance, this is accomplishing cell # 8, 9.**\n", + "\n", + "Function: \n", + "* __split_data() splits data to train, test, and (optional) validate. \n", + "* __split_data() implements the split_data_state of the state machine.\n", + "\n", + "```bash\n", + "def __split_data(cargo):\n", + " \"\"\"split_data_state without the state machine.\"\"\"\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"In split_data_state:\")\n", + "\n", + " for i in range(0, len(cargo.vectors)):\n", + " vector = cargo.vectors[i]\n", + " weights, seed = hermesui._ask_user_for_split_percentage(vector.data.datapath)\n", + " vector.split_data(weights, seed)\n", + "\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "trainingPercentage = 60/100.\n", + "testPercentage = 40/100.\n", + "validationPercentage = 0/100.\n", + "seed = 11" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 1: You implement what is already in __split_data() manually yourself" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "uservector = mv\n", + "\n", + "uservector.split_data([trainingPercentage, testPercentage, validationPercentage], seed)\n", + "train_ratings = uservector.training_vector\n", + "test_ratings = uservector.test_vector\n", + "validation_ratings = uservector.validation_vector\n", + "\n", + "# this is the same thing as\n", + "# train_ratings, test_ratings = uservector.vector.randomSplit([0.6, 0.4], 11)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "train_ratings.cache()\n", + "test_ratings.cache()\n", + "validation_ratings.cache()\n", + "\n", + "print train_ratings.count(), test_ratings.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 2: you execute using the __split_data() function" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n# TODO: will implement later\\n'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from hermes import *\n", + "\n", + "# call split_data function\n", + "hermes.__split_data(cargo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "mv = cargo.vectors[0]\n", + "train_ratings = mv.training_vector\n", + "test_ratings = mv.test_vector\n", + "validation_ratings = mv.validation_vector\n", + "print train_ratings.count(), test_ratings.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4: __make_prediction()\n", + "**For those who use [MovieLens 1M CF test src code](http://l41-srv-mcdh32.b.internal:8880/notebooks/Hermes/MovieLens%201M%20CF%20test%20src%20code.ipynb#) as guidance, this is accomplishing cell # 10.**\n", + "\n", + "Function: \n", + "* __make_prediction() develop model based on the train data and make prediction based on this model. 
\n", + "* __make_prediction() implements the make_prediction_state of the state machine.\n", + "\n", + "```bash\n", + "def __make_prediction(cargo):\n", + " \"\"\"make_prediction_state without the state machine.\"\"\"\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"In make_prediction_state:\") \n", + "\n", + " for i in range(0, len(cargo.vectors)):\n", + " thisvector = cargo.vectors[i]\n", + "\n", + " # select which recommenders based on the vector type\n", + " recommenders = None\n", + " thisvector_uservector = None\n", + " thisvector_contentvector = None\n", + " if helper.is_direct_subclass(thisvector, vg.UserVector):\n", + " if Globals.verbose: Globals.logger.debug(\"Iterating through recommenders for user vector on data %s\", thisvector.data.datapath)\n", + " thisvector_uservector = thisvector\n", + " recommenders = cargo.user_recommenders\n", + " elif helper.is_direct_subclass(thisvector, vg.ContentVector):\n", + " if Globals.verbose: Globals.logger.debug(\"Iterating through recommenders for content vector on data %s\", thisvector.data.datapath)\n", + " thisvector_contentvector = thisvector\n", + " thisvector_uservector = thisvector.uservector\n", + " recommenders = cargo.content_recommenders\n", + "\n", + " # run all recommenders on the vector\n", + " for r in recommenders:\n", + " if Globals.verbose: Globals.logger.debug(\"Making recommendation %s on data %s\", r, thisvector.data.datapath)\n", + " # TODO: implement other use case, ie. WithTfidf(), etc.\n", + " recommender = rg.RecommenderFactory().create_obj_recommender(r, thisvector_uservector, thisvector_contentvector)\n", + " # default use case\n", + " # recommender = RecommenderFactory().create_obj_recommender(r, vector, Default())\n", + " # with tf-idf use case \n", + " # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithTfidf())\n", + " # without tf-idf use case\n", + " # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithoutTfidf())\n", + " # etc.\n", + " with Timer() as t:\n", + " prediction_vector = recommender.make_prediction()\n", + " if Globals.verbose: Globals.logger.debug(\"Making prediction takes %s seconds\" % t.secs)\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 1: You implement what is already in __make_prediciton() manually yourself" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "import modules.recommendergenerator\n", + "\n", + "# create recommender object with the default use case\n", + "recommender_str = \"ALS\"\n", + "recommender = modules.recommendergenerator.RecommenderFactory().create_obj_recommender(recommender_str, uservector)\n", + "# or\n", + "# modules.recommendergenerator.RecommenderFactory().create_obj_recommender(recommender, uservector, Default())\n", + "\n", + "# get the prediction vector\n", + "prediction_vector = recommender.make_prediction()\n", + "# or\n", + "# prediction_vector = uservector.prediction\n", + "predicted1 = prediction_vector" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "[Rating(user=36455, product=12, rating=3.1620100630939234),\n", + " Rating(user=13019, product=12, rating=3.009068937170033),\n", + " Rating(user=1199, product=12, rating=1.889880680902047),\n", 
+ " Rating(user=56039, product=12, rating=1.8340114917394583),\n", + " Rating(user=68279, product=12, rating=2.575869762437719)]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prediction_vector.cache()\n", + "predicted1.cache()\n", + "\n", + "print type(predicted1)\n", + "predicted1.take(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "import algorithms.cf\n", + "\n", + "# instead of doing the step above, you can also call the function directly\n", + "prediction_vector = algorithms.cf.calc_cf_mllib(uservector.training_vector)\n", + "predicted2 = prediction_vector" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "[Rating(user=22502, product=12, rating=2.145246574980865),\n", + " Rating(user=22514, product=12, rating=1.8239622809024438),\n", + " Rating(user=22526, product=12, rating=1.6218700820020784),\n", + " Rating(user=22538, product=12, rating=3.22630662094852),\n", + " Rating(user=22550, product=12, rating=2.568704193724831)]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print type(predicted2)\n", + "predicted2.take(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# both ways are the same thing as\n", + "# predicted = algorithms.cf.calc_cf_mllib(uservector.training_vector)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 2: you execute using the __make_prediction() function" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n# TODO: will implement later\\n'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from hermes import *\n", + "\n", + "cargo.user_recommenders = [\"ALS\"]\n", + "cargo.content_recommenders = []\n", + "\n", + "# call make_prediction function\n", + "hermes.__make_prediction(cargo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "mv = cargo.vectors[0]\n", + "prediction_vector = mv.prediction_vector\n", + "print type(prediction_vector)\n", + "prediction_vector.take(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 5: __calculate_metrics()\n", + "**For those who use [MovieLens 1M CF test src code](http://l41-srv-mcdh32.b.internal:8880/notebooks/Hermes/MovieLens%201M%20CF%20test%20src%20code.ipynb#) as guidance, this is accomplishing cell # 11.**\n", + "\n", + "Function: \n", + "* __calculate_metrics() tests the metrics specified by the user. 
\n", + "* __calculate_metrics() implements the calculate_metrics_state of the state machine.\n", + "\n", + "```bash\n", + "def __calculate_metrics(cargo):\n", + " \"\"\"calculate_metrics_state without the state machine.\"\"\"\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"In calculate_metrics_state:\")\n", + "\n", + " # create a metric executor\n", + " executor = mg.MetricExecutor(mg.Metric())\n", + "\n", + " for i in range(0, len(cargo.vectors)):\n", + " Globals.logger.info(\"-\" * 80)\n", + " Globals.logger.info(\"Data: %s\" % cargo.vectors[i].data.datapath)\n", + " for m in cargo.metrics:\n", + " # check if metric exists\n", + " metric = mg.MetricFactory().create_obj_metric(m)\n", + " # set metric in executor\n", + " executor.change_metric(metric)\n", + " # execute the metric\n", + " with Timer() as t:\n", + " Globals.logger.info(\"Metric: %s = %f\" % (m, executor.execute(cargo.vectors[i])))\n", + " if Globals.verbose: Globals.logger.debug(\"Calculating metric takes %s seconds\" % t.secs)\n", + " Globals.logger.info(\"-\" * 80)\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 1: You implement what is already in __calculate_metrics() manually yourself" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "import modules.metricgenerator \n", + "\n", + "# create metric executor\n", + "executor = modules.metricgenerator.MetricExecutor(modules.metricgenerator.Metric())\n", + "\n", + "# create metric object\n", + "metric_str = \"RMSE\"\n", + "rmse_metric = modules.metricgenerator.MetricFactory().create_obj_metric(metric_str)\n", + "\n", + "# set metric in executor \n", + "executor.change_metric(rmse_metric)\n", + "\n", + "# calculate metric\n", + "rmse = executor.execute(uservector)\n", + "\n", + "print \"RMSE: \", rmse\n", + "\n", + "# switch metric object\n", + "metric_str = \"MAE\"\n", + "mae_metric = modules.metricgenerator.MetricFactory().create_obj_metric(metric_str)\n", + "executor.change_metric(mae_metric)\n", + "\n", + "# calculate metric\n", + "mae = executor.execute(uservector)\n", + "\n", + "print \"MAE: \", mae\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "import algorithms.performance_metrics\n", + "\n", + "# instead of doing the step above, you can also call the function directly\n", + "rmse = algorithms.performance_metrics.calculate_rmse(uservector.test_vector, uservector.prediction_vector)\n", + "print \"RMSE: \", rmse\n", + "\n", + "mae = algorithms.performance_metrics.calculate_mae(uservector.test_vector, uservector.prediction_vector)\n", + "print \"MAE: \", mae" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# both ways are the same thing as\n", + "# rmse = algorithms.performance_metrics.calculate_rmse(uservector.test_vector, uservector.prediction_vector)\n", + "# mae = algorithms.performance_metrics.calculate_mae(uservector.test_vector, uservector.prediction_vector)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 2: you execute using the __calculate_metrics() function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "\n", + 
"cargo.metrics = [\"ALS\"]\n", + "\n", + "# call calculate_metrics function\n", + "hermes.__calculate_metrics(cargo)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From f60cd15c880ffdd7c66cbd39141f8c0638b2b424 Mon Sep 17 00:00:00 2001 From: tiffanyj41 Date: Mon, 25 Jan 2016 12:29:29 -0800 Subject: [PATCH 35/39] Update using_notebook.md --- docs/using_notebook.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/using_notebook.md b/docs/using_notebook.md index bf156d4..9a413d9 100644 --- a/docs/using_notebook.md +++ b/docs/using_notebook.md @@ -4,12 +4,14 @@ 2. Launch Anaconda luncher 3. Launch ipython-notebook 4. Create an iPython profile for use with PySpark + ```bash - ipython profile create pyspark +ipython profile create pyspark ``` 5. Create a iPython notebook configuration + ```bash - vim ~/.ipython/profile_pyspark/ipython_notebook_config.py +vim ~/.ipython/profile_pyspark/ipython_notebook_config.py ``` ```bash c = get_config() From 7e33cb13adeb636691757e9891acf58359d14a99 Mon Sep 17 00:00:00 2001 From: tiffanyj41 Date: Mon, 25 Jan 2016 12:43:57 -0800 Subject: [PATCH 36/39] Update installation.md --- docs/installation.md | 58 ++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/docs/installation.md b/docs/installation.md index f8c8bc7..7769da9 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -13,47 +13,47 @@ 1. Download 2. Double click on .dmg file to install. 3. In a terminal, type java -version. You should see the following: -```bash -java version "1.8.0_65" -Java(TM) SE Runtime Environment (build 1.8.0_65-b17) -Java HotSpot(TM) 64-Bit Server VM (build 25.65-b01, mixed mode) -``` + ```bash + java version "1.8.0_65" + Java(TM) SE Runtime Environment (build 1.8.0_65-b17) + Java HotSpot(TM) 64-Bit Server VM (build 25.65-b01, mixed mode) + ``` 2. Set JAVA_HOME -```bash -export JAVA_HOME=$(/usr/libexec/java_home) -``` + ```bash + export JAVA_HOME=$(/usr/libexec/java_home) + ``` 3. Install Homebrew -```bash -$ ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" -``` + ```bash + $ ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" + ``` 4. Install Scala -``` -$ brew install scala -``` + ``` + $ brew install scala + ``` 5. Download Spark from https://spark.apache.org/downloads.html. 6. Set SCALA_HOME and SPARK_HOME and export it to path in your .bash_profile. -```bash -export SPARK_HOME=/path/to/your/spark -export PATH=$PATH:$SPARK_HOME/bin -export SCALA_HOME=/path/to/your/scala -export PATH=$PATH:$SCALA_HOME/bin -``` + ```bash + export SPARK_HOME=/path/to/your/spark + export PATH=$PATH:$SPARK_HOME/bin + export SCALA_HOME=/path/to/your/scala + export PATH=$PATH:$SCALA_HOME/bin + ``` 7. Export PySpark classes to the Python path after you have installed Python. -```bash -export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH -``` + ```bash + export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH + ``` 8. 
Build and install Apache Spark -```bash -$ brew install sbt -$ cd $SPARK_HOME -$ sbt/sbt clean assembly -``` + ```bash + $ brew install sbt + $ cd $SPARK_HOME + $ sbt/sbt clean assembly + ``` ### Installing Hadoop Please follow this [guide](http://zhongyaonan.com/hadoop-tutorial/setting-up-hadoop-2-6-on-mac-osx-yosemite.html). @@ -86,4 +86,4 @@ $ pip install --editable . Now, you can just run hermes the binary and it will prompt you with what you want to do with the data that you have. ```bash $ hermes -``` \ No newline at end of file +``` From cd193ba41a034659a2ce753a04d92aa93f510cb1 Mon Sep 17 00:00:00 2001 From: tiffanyj41 Date: Mon, 25 Jan 2016 12:52:21 -0800 Subject: [PATCH 37/39] Update using_notebook.md --- docs/using_notebook.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/using_notebook.md b/docs/using_notebook.md index 9a413d9..e7e0afa 100644 --- a/docs/using_notebook.md +++ b/docs/using_notebook.md @@ -4,7 +4,26 @@ 2. Launch Anaconda luncher 3. Launch ipython-notebook 4. Create an iPython profile for use with PySpark + 1. Make sure you have exported PySpark classes to your python path and build Apache Spark. + To export PySpark classes, add the following to your ~/.bash_profile: + ```bash + # export spark to path + export SPARK_HOME=/path/to/your/spark + export PATH=$PATH:$SPARK_HOME/bin + # export pyspark classes to the python path + export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH + # export py4j to the python path + export PYTHONPATH=$SPARK_HOME/python/lib/py4j--src.zip:$PYTHONPATH + ``` + + 2. Build Apache Spark + + ```bash + $ cd $SPARK_HOME + $ sbt/sbt clean assembly + ``` + ```bash ipython profile create pyspark ``` From cdc2e65235287ea889d518b71938bb8ac8d17d72 Mon Sep 17 00:00:00 2001 From: tiffanyj41 Date: Thu, 28 Jan 2016 14:28:17 -0800 Subject: [PATCH 38/39] Update using_notebook.md --- docs/using_notebook.md | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/docs/using_notebook.md b/docs/using_notebook.md index e7e0afa..f71841e 100644 --- a/docs/using_notebook.md +++ b/docs/using_notebook.md @@ -1,8 +1,8 @@ # How to use iPython Notebook 1. Install Anaconda -2. Launch Anaconda luncher -3. Launch ipython-notebook +2. Launch Anaconda launcher +3. Launch ipython-notebook 4. Create an iPython profile for use with PySpark 1. Make sure you have exported PySpark classes to your python path and build Apache Spark. To export PySpark classes, add the following to your ~/.bash_profile: @@ -34,9 +34,21 @@ vim ~/.ipython/profile_pyspark/ipython_notebook_config.py ``` ```bash c = get_config() - c.NotebookApp.ip = '*' - c.NotebookApp.open_browser = False - c.NotebookApp.port = 8880 + + # kernel configuration + c.IPKernelApp.pylab = 'inline' # set %matplotlib inline always + + # notebook configuration + c.NotebookApp.ip = '*' # '*' == to bind on all IPs + # do not open the browser at start of ipython notebook + # so that we can point the ipython notebook address + # in an active web browser + c.NotebookApp.open_browser = False + + # (optional) you can add password to your notebook if desired + + # set a fixed port number that does not conflict with other iPython profiles + c.NotebookApp.port = 8880 ``` 6. 
Create PySpark Setup configuration ```bash @@ -50,6 +62,10 @@ vim ~/.ipython/profile_pyspark/ipython_notebook_config.py # setup spark home findspark.init() spark_home = findspark.find() + + # setup spark home approach #2 + # make sure you have already set $SPARK_HOME in $PATH + # spark_home = os.environ.get('SPARK_HOME', None) # add spark's home directory to path sys.path.insert(0, os.path.join(spark_home, "python")) @@ -72,8 +88,11 @@ vim ~/.ipython/profile_pyspark/ipython_notebook_config.py from pyspark.sql import SQLContext # setup SparkContext - sc = SparkContext._active_spark_context - + try: + sc = SparkContext() + except: + sc = SparkContext._active_spark_context + # setup SQLContext sqlCtx = SQLContext(sc) ``` From dad25dbbb99efeb0ad061a82a1dee050e234d170 Mon Sep 17 00:00:00 2001 From: tiffanyj41 Date: Wed, 16 Mar 2016 15:18:28 -0700 Subject: [PATCH 39/39] Update README.md added a warning that the team no longer pursues the command line path --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 591e539..33e2ae9 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,9 @@ For a detailed installation guide, please read on [Hermes Installation Guide](ht * Hadoop 2.7.1 * virtualenv +### Warning: +We have dropped working on Hermes for the command line because the team has decided to pursue running Hermes on the Spark's iPython Notebook instead. + ### How to Install Hermes: (Optional) After you have installed the dependencies, if you have different projects that require different Python environment, you can use a Virtual Environment. As listed in the Virtual Environment's [site](http://docs.python-guide.org/en/latest/dev/virtualenvs/), "a Virtual Environment is a tool to keep the dependencies required by different projects in separate places, by creating virtual Python environments for them."