diff --git a/.gitignore b/.gitignore index 3270149..c1f7ebd 100644 --- a/.gitignore +++ b/.gitignore @@ -112,7 +112,7 @@ lib64 __pycache__ # The __init__.py's that scram puts everywhere -__init__.py +# __init__.py # Installer logs pip-log.txt diff --git a/README.md b/README.md index 930925b..33e2ae9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,76 @@ -# hermes +# Hermes + Hermes is Lab41's foray into recommender systems. It explores how to choose a recommender system for a new application by analyzing the performance of multiple recommender system algorithms on a variety of datasets. -It also explores how recommender systems may assist a software developer of data scientist find new data, tools, and computer programs. +It also explores how recommender systems may assist a software developer or a data scientist to find new data, tools, and computer programs. + +This readme will be updated as the project progresses so stay tuned! + + +## Documentation + +[Hermes Documentation](https://github.com/Lab41/hermes/tree/master/docs) + + +## Basic Installation Guide + +For a detailed installation guide, please read on [Hermes Installation Guide](https://github.com/Lab41/hermes/tree/master/docs/installation.md). + +### Dependencies: +* Spark 1.5.1 +* Scala 2.11.7 +* Pyspark 0.8.2.1 +* Hadoop 2.7.1 +* virtualenv + +### Warning: +We have dropped working on Hermes for the command line because the team has decided to pursue running Hermes on the Spark's iPython Notebook instead. + +### How to Install Hermes: + +(Optional) After you have installed the dependencies, if you have different projects that require different Python environment, you can use a Virtual Environment. As listed in the Virtual Environment's [site](http://docs.python-guide.org/en/latest/dev/virtualenvs/), "a Virtual Environment is a tool to keep the dependencies required by different projects in separate places, by creating virtual Python environments for them." + +```bash +$ virtualenv name_of_your_virtualenv +$ . name_of_your_virtualenv/bin/activate +``` + +To install Hermes, run +```bash +$ python setup.py install +``` + +This will create a binary called hermes in /usr/local/bin/hermes. Instead of running the binary with the entire path (ie. ./usr/local/bin/hermes), you can install it so that you can run hermes without calling the entire path on the command line. +```bash +$ pip install --editable . +``` + +Now, you can just run hermes the binary and it will prompt you with what you want to do with the data that you have. +```bash +$ hermes +``` + +## How to Run Hermes + +NOTE: Next implementation of Hermes will be set up so that it does not use pseudo-distributed mode in a single node cluster. + +For a detailed guide on how to run Hermes, please read on [How to Run Hermes](https://github.com/Lab41/hermes/tree/master/docs/run.md) guide. + +Hermes requires at least three arguments in order to run properly. +* fs_default_ip_addr: IP address of fs.default.name used in HDFS, ie. localhost:9000. +* list_of_files_config: A configuration file that lists all the json paths referenced by configs. +* configs: Users can provide an unlimited amount of configuration files that list what datasets to use and which recommender algorithms and metrics to apply to each dataset. 
+
+With one configuration file:
+```bash
+$ hermes localhost:9000 ./hermes/configs/list_of_files.ini ./hermes/configs/config1.ini
+```
+
+With more than one configuration file:
+```bash
+$ hermes localhost:9000 ./hermes/configs/list_of_files.ini ./hermes/configs/config1.ini ./hermes/configs/config2.ini
+```
+
+## State of Build
-This readme will be updated as the project progresses so stay tuned!
\ No newline at end of file
+The build is currently in progress. We will show its status using TravisCI once it is set up.
diff --git a/docs/assumptions.md b/docs/assumptions.md
new file mode 100644
index 0000000..c0685c7
--- /dev/null
+++ b/docs/assumptions.md
@@ -0,0 +1,70 @@
+# Assumptions
+
+* [Assumptions on Execution](#assumptions-on-execution)
+* [Assumptions on Vector Creation](#assumptions-on-vector-creation)
+* [Assumptions on Directory Creation](#assumptions-on-directory-creation)
+
+## Assumptions on Execution
+
+Here is an example file called config.ini.
+
+```bash
+[datasets]
+dataname = movielens
+
+# user vector
+user_vector_data = ["movielens_10m_ratings", "movielens_20m_ratings"]
+user_vector_schemas = ["movielens_10m_ratings_schema", "movielens_20m_ratings_schema"]
+user_vector_transformations = ["ratings", "ratings_to_interact"]
+
+# content vector
+content_vector_data = ["movielens_10m_movies"]
+content_vector_schema = ["movielens_10m_movies_schema"]
+content_vector_transformations = ["genre"]
+
+[recommenders]
+user_recommenders = ["ALS"]
+content_recommenders = ["CBWithKMeans"]
+
+[metrics]
+metrics = ["RMSE", "MAE"]
+```
+
+When you specify the configuration above, the assumptions we make during execution are as follows:
+* each transformation is applied in sequential order to the data, meaning
+  * user_vector_transformation "ratings" is applied to "movielens_10m_ratings" and "movielens_10m_ratings_schema"
+  * user_vector_transformation "ratings_to_interact" is applied to "movielens_20m_ratings" and "movielens_20m_ratings_schema"
+  * content_vector_transformation "genre" is applied to "movielens_10m_movies" and "movielens_10m_movies_schema"
+* user_recommenders take in a list of recommender algorithms that will be applied to all user_vector_data, meaning
+  * apply ALS to a User Vector of movielens_10m_ratings that has been transformed by vector transformation "ratings"
+  * apply ALS to a User Vector of movielens_20m_ratings that has been transformed by vector transformation "ratings_to_interact"
+* content_recommenders take in a list of recommender algorithms that will be applied to all content_vector_data, meaning
+  * apply CBWithKMeans to a Content Vector of movielens_10m_movies that has been transformed by vector transformation "genre"
+* metrics take in a list of metrics that will be applied to all data, including both user_vector_data and content_vector_data, after recommender algorithms have been applied to them, meaning
+  * apply RMSE to a User Vector of movielens_10m_ratings that has been transformed by vector transformation "ratings" and recommendation system algorithm ALS
+  * apply RMSE to a User Vector of movielens_20m_ratings that has been transformed by vector transformation "ratings_to_interact" and recommendation system algorithm ALS
+  * apply RMSE to a Content Vector of movielens_10m_movies that has been transformed by vector transformation "genre" and recommendation system algorithm CBWithKMeans
+  * apply MAE to a User Vector of movielens_10m_ratings that has been transformed by vector transformation "ratings" and recommendation system algorithm ALS
+  * apply MAE to a User Vector of movielens_20m_ratings that has been transformed by vector transformation "ratings_to_interact" and recommendation system algorithm ALS
+  * apply MAE to a Content Vector of movielens_10m_movies that has been transformed by vector transformation "genre" and recommendation system algorithm CBWithKMeans
+
+## Assumptions on Vector Creation
+
+Each dataset is unique in that transforming JSON to an RDD is different for each dataset. This step is implemented in vectorgenerator.py. When we separate the implementation of vector generation for each dataset into individual files in the hermes/hermes/modules/vectors directory, each of these files needs to import vectorgenerator.py in this specific manner:
+
+```bash
+from hermes.modules.vectorgenerator import UserVector, ContentVector
+```
+
+The reason lies in the instantiation of the vector object in the VectorFactory class. When we specify which vector to create, it is either a UserVector or a ContentVector class; both are defined in vectorgenerator.py, and vectorgenerator.py as a module is hermes.modules.vectorgenerator.
+
+Since we can no longer use the __subclasses__() function to iterate through all children of the UserVector class or all children of the ContentVector class in order to instantiate the right vector (the children are now defined in separate modules in the hermes/hermes/modules/vectors directory), we have to load all modules and go through each class in each module to know all children of UserVector or ContentVector. Unfortunately, if you define the import statement as "from modules.vectorgenerator" instead of "from hermes.modules.vectorgenerator", Python does not consider the two modules to be the same even though they are.
+
+We have yet to determine why this is the case.
+
+When users add a new dataset, we cannot always assume that they will import exactly as "from hermes.modules.vectorgenerator import UserVector, ContentVector" because they can also import it as "from modules.vectorgenerator import UserVector, ContentVector", which is equally valid. For this reason, we have made the assumption that if the parent class of, for example, MovieLensUserVector has the __name__ UserVector, then MovieLensUserVector is a child of UserVector. The problem with this assumption is that if MovieLensUserVector inherits from multiple parents in different modules that share the same class name, both parents will be treated as the same class.
+
+
+## Assumptions on Directory Creation
+
+We assume that there is only one directory each with the label "vg", "rg", and "mg". These directories store the modules for vector, recommender, and metric creation specific to either datasets or use cases. This assumption is made in the helper function load_modules_in_zip(), which checks whether the base directory of a file path is "vg", "rg", or "mg" in order to load the modules in the notebook during vector, recommender, or metric creation respectively.
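+
+A minimal sketch of this base-directory check (illustrative only; the actual load_modules_in_zip() implementation may differ):
+
+```python
+import os
+
+# Illustrative helper: decide, from a module's file path, whether the module
+# belongs to vector ("vg"), recommender ("rg"), or metric ("mg") creation.
+def creation_kind(path):
+    base_dir = os.path.basename(os.path.dirname(path))
+    if base_dir in ("vg", "rg", "mg"):
+        return base_dir
+    return None
+```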
diff --git a/docs/configs.md b/docs/configs.md new file mode 100644 index 0000000..8c69c77 --- /dev/null +++ b/docs/configs.md @@ -0,0 +1,246 @@ +# Hermes's Configuration Files Explained + +* [List of Files Standard](#list-of-files-standard) + * [Dataname](#dataname) + * [JSON Paths](#json-paths) +* [Configuration File Standard](#configuration-file-standard) + * [Datasets](#datasets) + * [Dataname](#dataname) + * [Vectors](#vectors) + * [Optional Variables: Schemas & Support Files](#optional-variables) +* [Recommenders](#recommenders) +* [Metrics](#metrics) + +Hermes requires at least two configuration files: +* **list_of_files_config**: A configuration file that lists all the json paths referenced by configs. +* **configs**: Users can provide an unlimited amount of configuration files that list what datasets to use and which recommender algorithms and metrics to apply to each dataset. + +Each configuration file requires it to follow a certain standard. These standards will be further explained below. + +Saved configuration files can be found in hermes/hermes/configs in case you want to run a previously saved configuration. + +Before continuing, it might be beneficial if you understand the Hermes's framework by reading this [guide](https://github.com/Lab41/hermes/tree/master/docs/framework.md) first. + +## List of Files Standard + +Let's take a look at an example file called list_of_files.ini. + +```bash +[movielens] +# 20M data +movielens_20m_ratings = /path/to/your/movielens/20m/ratings.json.gz +movielens_20m_tags = /path/to/your/movielens/20m/tags.json.gz +movielens_20m_movies = /path/to/your/movielens/20m/movies.json.gz + +# 10M data +movielens_10m_ratings = /path/to/your/movielens/10m/ratings.json.gz +movielens_10m_tags = /path/to/your/movielens/10m/tags.json.gz +movielens_10m_movies = /path/to/your/movielens/10m/movies.json.gz + +# 1M data +movielens_1m_ratings = /path/to/your/movielens/1m/ratings.json.gz +movielens_1m_tags = /path/to/your/movielens/1m/tags.json.gz +movielens_1m_movies = /path/to/your/movielens/1m/movies.json.gz + +# 20M schema +movielens_20m_ratings_schema = /path/to/your/movielens/20m/ratings_schema.json.gz +movielens_20m_tags_schema = /path/to/your/movielens/20m/tags_schema.json.gz +movielens_20m_movies_schema = /path/to/your/movielens/20m/movies_schema.json.gz + +# 10M schema +movielens_10m_ratings_schema = /path/to/your/movielens/10m/ratings_schema.json.gz +movielens_10m_tags_schema = /path/to/your/movielens/10m/tags_schema.json.gz +movielens_10m_movies_schema = /path/to/your/movielens/10m/movies_schema.json.gz + +# 1M schema +movielens_1m_ratings_schema = /path/to/your/movielens/1m/ratings_schema.json.gz +movielens_1m_tags_schema = /path/to/your/movielens/1m/tags.json_schema.gz +movielens_1m_movies_schema = /path/to/your/movielens/1m/movies_schema.json.gz +``` + +### Dataname + +A single data can be split into multiple JSON files. In this case, [movielens] is a data that is split into multiple JSON files. For lack of a better term, we call [movielens] a "dataname" variable. There can be multiple datanames in a list of files (ie. list_of_files.ini), but there can only be one dataname in a configuration file (ie. config.ini). + +Dataname plays an important role in that we know which data each JSON file is coming from. This check can be found in hermes/hermes/modules/vectorgenerator.py under a class function called isSameDataInstance() for each data instantiated class. What is checked in isSameDataInstance() has to match the dataname exactly. 
If it did not, Hermes will throw an error message. + +For example, in the case of the Movie Lens data, its dataname is "movielens". The check in the class MovieLens's isSameDataInstance() function will check that dataname is equal to "movielens". If you passed [MovieLens] to list_of_files.ini, for example, and the check in isSameDataInstance() is "movielens", it will fail. However, if you passed [movielens] to list_of_files.ini and the check in isSameDataInstance() is "movielens", it will pass. + +### JSON Paths + +Underneath the dataname heading, each variable (ie. movielens_20m_ratings, movielens_20m_tags, etc.) is a shorthand name for a specific JSON file. These variables will store the path to their individual JSON file. They will be used in the configuration file (ie. config.ini) as input to user_vector_data and content_vector_data variable. + +## Configuration File Standard + +**If you wanted to know what data is currently supported by Hermes and the different ways you can parse the data (and how you can add your own data not yet supported), please checkout [List of Data Supported](https://github.com/Lab41/hermes/tree/master/docs/data_supported.md) guide.** + +**If you wanted to know what types of recommender system algorithms currently supported by Hermes (and how you can add different algorithms not yet supported), please check out [List of Recommender Systems Supported](https://github.com/Lab41/hermes/tree/master/docs/recommenders_supported.md) guide.** + +**If you wanted to know what types of metrics currently supported by Hermes (and how you can add different metrics not yet supported), please check out [List of Metrics Supported](https://github.com/Lab41/hermes/tree/master/docs/metrics_supported.md) guide.** + +Let's take a look at an example file called config.ini. + +```bash +[datasets] +dataname = movielens + +# user vector +user_vector_data = ["movielens_10m_ratings", "movielens_20m_ratings"] +user_vector_schemas = ["movielens_10m_ratings_schema", "movielens_20m_ratings_schema"] +user_vector_transformations = ["ratings", "ratings_to_interact"] + +# content vector +content_vector_data = ["movielens_10m_movies"] +content_vector_schema = ["movielens_10m_movies_schema"] +content_vector_transformations = ["genre"] + +[recommenders] +user_recommenders = ["ALS"] +content_recommenders = ["CBWithKMeans"] + +[metrics] +metrics = ["RMSE", "MAE"] +``` + +### Datasets + +Datasets specify which data we are going to use. It contains dataname, user or content vectors, and support files. + +#### Dataname + +One configuration file can specify only one dataname. Dataname is the name of the data where each JSON file is derived from. + +#### Vectors + +Vector is the transformed data that will be subjected to the recommender system algorithms and metrics. + +Understanding how a vector is created will provide an understanding of what a vector is. To create a vector, the steps are as follow: + +1. Read the configuration files to know what type of vectors we are creating. +2. Read each JSON file to obtain the data. The output of this step is the creation of a dataframe. +3. Once you have this dataframe, you can subject it to a transformation specified by the vector transformation. 
For example: if we wanted to create a user vector from the JSON file "movielens_10m_ratings" of vector tranformation "ratings" as specified by config.ini above, the data from the JSON file "movielens_10m_ratings" is transformed into a RDD of [(user_id, movie_id, rating)] because vector transformation "rating" converts MovieLens data into [(user_id, movie_id, rating)]. Different vector transformation will implement different transformation of the data. For vector transformation "ratings_to_interact", it will convert MovieLens data into [(user_id, movie_id, just_rating_greater_than_3)]. + +To wrap it up, vector refers to a dataframe that has been converted to a RDD after a transformation occurs. This transformation is specified by the vector tranformation. + +There are two types of vectors currently implemented: User Vector and Content Vector. User Vector refers to the vector describing users in the data. Content Vector refers to the vector describing the content in the data. + +Each vector requires the following to be specified in the configuration file: +* **user_vector_data** / **content_vector_data**: Vector data takes in a list of JSON names that reference the JSON path as specified in the list of files config (ie. list_of_files.ini). user_vector_data will create a User Vector; content_vector_data will create a Content Vector. +* **user_vector_transformations** / **content_vector_transformations**: user_vector_transformations and content_vector_transformations will take in a list of transformations to apply to user_vector_data and content_vector_data respectively. Note that user_vector_data and user_vector_transformations (as well as content_vector_data and content_vector_transformations) have a one-on-one relationship, meaning vector transformation at index 0 will be applied to vector data at index 0, vector transformation at index 1 will be applied to vector data at index 1, and vector transformation at index n will be applied to vector data at index n. Currently, Hermes does not have the ability to apply multiple transformations onto one vector data unless the vector data is specified multiple times in user_vector_data / content_vector_data with its respective vector transformation. + +#### Optional Variables: Schemas & Support Files + +Each vector can specify optional variables that can assist in process speed or vector transformation: +* **user_vector_schemas** / **content_vector_schemas**: Specifying a schema for each data can speed up the reading process of the JSON file. Again, user_vector_schemas and content_vector_schemas have a one-to-one relationship with user_vector_data and content_vector_data respectively, meaning user_vector_schemas at index 0 applies to user_vector_data at index 0; content_vector_schemas at index 0 applies to content_vector_data at index 0. +* **support_files**: Additional variables listed in the [datasets] section will be treated as support files. During the creation of a Vector, these support files will be passed in as a dictionary with the key as a variable and the value as the value received. Currently, it cannot take a list of values as its value. For example: if glove_model = /data/glove/glove.txt is an additional line listed under the [datasets] section, it will be passed in as a dictionary with glove_model as key and /data/glove/glove.txt as its value. + +### Recommenders + +user_recommenders take in a list of recommender algorithms that will be applied to all user_vector_data. 
+
+content_recommenders take in a list of recommender algorithms that will be applied to all content_vector_data.
+
+### Metrics
+
+metrics take in a list of metrics that will be applied to all data, including both user_vector_data and content_vector_data, after recommender algorithms have been applied to them.
+
diff --git a/docs/data_supported.md b/docs/data_supported.md
new file mode 100644
index 0000000..295a49f
--- /dev/null
+++ b/docs/data_supported.md
@@ -0,0 +1,168 @@
+# Datasets Supported
+
+* [Movie Lens](#movielens)
+  * [Configuration Files](#movielens-configuration-files)
+  * [Vector Transformation for User Vector](#movielens-vector-transformation-for-user-vector)
+  * [Vector Transformation for Content Vector](#movielens-vector-transformation-for-content-vector)
+* [Wikipedia](#wiki)
+  * [Configuration Files](#wiki-configuration-files)
+  * [Vector Transformation for User Vector](#wiki-vector-transformation-for-user-vector)
+  * [Vector Transformation for Content Vector](#wiki-vector-transformation-for-content-vector)
+* [Adding New Datasets](#adding-new-datasets)
+
+
+Hermes currently supports the following datasets:
+
+Dataset | Location
+------------- | -------------
+MovieLens | http://grouplens.org/datasets/movielens/
+Wikipedia | https://en.wikipedia.org/wiki/Wikipedia:Database_download#English-language_Wikipedia
+
+Additional datasets will be added in the future.
+
+If you have datasets not currently supported by Hermes, please follow the instructions in the [Adding New Datasets](#adding-new-datasets) section below.
+
+Before continuing, it might be beneficial to understand Hermes's framework by reading this [guide](https://github.com/Lab41/hermes/tree/master/docs/framework.md) first.
+
+## Movie Lens
+
+### Configuration Files
+For JSON files derived from Movie Lens data, you need to specify the following:
+* In the configuration file, specify dataname = movielens
+* In the configuration file that lists all JSON files, specify the section as [movielens]
+
+As long as the dataname check matches the dataname given in the configuration files, Hermes will recognize the data as Movie Lens data. This check can be found in hermes/hermes/modules/vectorgenerator.py under a class function called isSameDataInstance(). What is checked in isSameDataInstance() has to match the dataname exactly. If it does not, Hermes will throw an error. In this case, dataname has to match "movielens" exactly for Hermes to recognize this as Movie Lens data.
+
+### Vector Transformation for User Vector
+
+You can specify the vector transformation on a user vector by setting user_vector_transformations to one of the following (a sketch of these transformations follows the list):
+
+* *ratings*: This vector transformation transforms the data into the format of [(user_id, movie_id, rating)].
+* *pos_ratings*: This vector transformation transforms the data into the format of [(user_id, movie_id, rating)] and keeps only ratings that are greater than 3. In other words, it lists all positive ratings, where we assume a rating of 4 or 5 is a positive one.
+* *ratings_to_interact*: This vector transformation transforms the data into the format of [(user_id, movie_id, binary_rating)] where binary_rating is -1 for a rating of 2 or less and 1 for a rating of 3 or more.
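+
+A hedged sketch of how these user-vector transformations might be implemented, mirroring the dataset template shown in the [framework guide](https://github.com/Lab41/hermes/tree/master/docs/framework.md); the actual implementations live in hermes/hermes/modules/vectors and may differ:
+
+```python
+from hermes.modules.vectorgenerator import UserVector, ContentVector
+
+class MovieLens(object):
+    @classmethod
+    def isSameDataInstance(cls, comparisonData):
+        return comparisonData.dataname == "movielens"
+
+class MovieLensUserVector(UserVector, MovieLens):
+    def ratings(self):
+        # [(user_id, movie_id, rating)]
+        return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating))
+
+    def pos_ratings(self):
+        # keep only ratings greater than 3
+        ratings = self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating))
+        return ratings.filter(lambda r: r[2] > 3)
+
+    def ratings_to_interact(self):
+        # -1 for a rating of 2 or less, 1 for a rating of 3 or more
+        return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, -1 if row.rating < 3 else 1))
+```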
+
+### Vector Transformation for Content Vector
+
+You can specify the vector transformation on a content vector by setting content_vector_transformations to one of the following:
+
+* *genre*: This vector transformation transforms the data into the format of [(movie_id, [genre_1, genre_2, ..., genre_n])]. In other words, this vector transformation lists the genres of each movie.
+
+## Wikipedia
+
+### Configuration Files
+For JSON files derived from Wikipedia data, you need to specify the following:
+* In the configuration file, specify dataname = wiki
+* In the configuration file that lists all JSON files, specify the section as [wiki]
+
+As long as the dataname check matches the dataname given in the configuration files, Hermes will recognize the data as Wikipedia data. This check can be found in hermes/hermes/modules/vectorgenerator.py under a class function called isSameDataInstance(). What is checked in isSameDataInstance() has to match the dataname exactly. If it does not, Hermes will throw an error. In this case, dataname has to match "wiki" exactly for Hermes to recognize this as Wikipedia data.
+
+### Vector Transformation for User Vector
+
+You can specify the vector transformation on a user vector by setting user_vector_transformations to one of the following:
+
+* *num_edits*: This vector transformation transforms the data into the format of [(user_id, article_id, num_edits)] where num_edits counts the number of times a user modifies an article.
+* *any_interact*: This vector transformation transforms the data into the format of [(user_id, article_id, num_interact)] where num_interact indicates whether the user has interacted with an article. Even if the user edits the article more than once, this vector transformation counts the interaction as one.
+* *num_edits_ceil*: This vector transformation transforms the data into the format of [(user_id, article_id, num_edits_with_ceiling)] where num_edits counts the number of times a user modifies an article and selects the max between num_edits and 5.
+
+### Vector Transformation for Content Vector
+
+You can specify the vector transformation on a content vector by setting content_vector_transformations to one of the following:
+
+* *glove*: Explanation will be provided once implemented. (TODO: in development)
+* *category_map*: Explanation will be provided once implemented. (TODO: in development)
+
+
+## Adding New Datasets
+
+This excerpt is taken from [Understanding Hermes's Framework](https://github.com/Lab41/hermes/tree/master/docs/framework.md#adding-new-datasets).
+
+Every time you add a new dataset, you will need to create a new file in hermes/hermes/modules/vectors. The template for supporting an additional dataset is shown below.
+
+Template:
+
+```bash
+from hermes.modules.vectorgenerator import UserVector, ContentVector
+import numpy as np  # used by the content vector transformation below
+
+class NewDataset(object):
+    @classmethod
+    def isSameDataInstance(cls, comparisonData):
+        return comparisonData.dataname == "new_dataset_dataname_name"
+
+class NewDatasetUserVector(UserVector, NewDataset):
+    def user_vector_transformation_1(self):
+        return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating))
+
+    def user_vector_transformation_2(self):
+        return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)).filter(lambda (u, m, r): r > 3)
+
+    def user_vector_transformation_n(self):
+        return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, -1 if row.rating < 3 else 1))
+
+class NewDatasetContentVector(ContentVector, NewDataset):
+    def content_vector_transformation_1(self):
+        def internal_helper_function(row):
+            return np.array((
+                int(row.genre_action),
+                int(row.genre_adventure),
+                int(row.genre_animation),
+            ))
+        return self.data.dataframe.map(lambda row: (row.movie_id, internal_helper_function(row)))
+
+```
+
+1. Define a class for your dataset. In this case, it is specified as the class NewDataset.
+2. Define a User Vector and a Content Vector class for your dataset that inherit from your dataset class and from UserVector or ContentVector respectively. In this case, the UserVector for NewDataset is called NewDatasetUserVector, and the ContentVector for NewDataset is called NewDatasetContentVector.
+3. Provide the dataname for the check in isSameDataInstance(). In this case, dataname is checked against "new_dataset_dataname_name".
+4. Provide the vector transformation logic for each type of vector. For User Vector transformations, define the function in the class NewDatasetUserVector. In this case, these vector transformations are user_vector_transformation_1, user_vector_transformation_2, and user_vector_transformation_n. For Content Vector transformations, define the function in the class NewDatasetContentVector. In this case, the vector transformation is content_vector_transformation_1.
+5. Additional support files needed for the vector transformations are passed down from the configuration file as self.support_files. self.support_files is a dictionary whose keys are the variable names and whose values are the values received in the configuration file. Please read the [configuration file guide](https://github.com/Lab41/hermes/tree/master/docs/configs.md#optional-variables) for more details.
+
+After you have defined the concrete implementation of the new dataset, you can now use the dataset and apply multiple recommender system algorithms and metrics.
+
+In list_of_files.ini:
+```bash
+[new_dataset_dataname_name]
+new_dataset_10m_ratings = /path/to/your/new/dataset/10m/ratings.json.gz
+new_dataset_20m_ratings = /path/to/your/new/dataset/20m/ratings.json.gz
+new_dataset_10m_ratings_schema = /path/to/your/new/dataset/10m/ratings_schema.json.gz
+new_dataset_20m_ratings_schema = /path/to/your/new/dataset/20m/ratings_schema.json.gz
+
+new_dataset_10m_movies = /path/to/your/new/dataset/10m/movies.json.gz
+new_dataset_10m_movies_schema = /path/to/your/new/dataset/10m/movies_schema.json.gz
+```
+
+In new_dataset_config.ini:
+```bash
+[datasets]
+dataname = new_dataset_dataname_name
+
+# user vector
+user_vector_data = ["new_dataset_10m_ratings", "new_dataset_20m_ratings"]
+user_vector_schemas = ["new_dataset_10m_ratings_schema", "new_dataset_20m_ratings_schema"]
+user_vector_transformations = ["user_vector_transformation_1", "user_vector_transformation_2"]
+
+# content vector
+content_vector_data = ["new_dataset_10m_movies"]
+content_vector_schema = ["new_dataset_10m_movies_schema"]
+content_vector_transformations = ["content_vector_transformation_1"]
+
+[recommenders]
+user_recommenders = ["ALS"]
+content_recommenders = ["CBWithKMeans"]
+
+[metrics]
+metrics = ["RMSE", "MAE"]
+```
+
+When you run hermes with the above configuration, the following will happen:
+* user_vector_transformation_1 will be applied to new_dataset_10m_ratings.
+* user_vector_transformation_2 will be applied to new_dataset_20m_ratings.
+* content_vector_transformation_1 will be applied to new_dataset_10m_movies.
+* ALS will be applied to the UserVector of new_dataset_10m_ratings.
+* ALS will be applied to the UserVector of new_dataset_20m_ratings.
+* CBWithKMeans will be applied to the ContentVector of new_dataset_10m_movies.
+* RMSE will be applied to the UserVector of new_dataset_10m_ratings after it has been subjected to ALS.
+* RMSE will be applied to the UserVector of new_dataset_20m_ratings after it has been subjected to ALS.
+* RMSE will be applied to the ContentVector of new_dataset_10m_movies after it has been subjected to CBWithKMeans.
+* MAE will be applied to the UserVector of new_dataset_10m_ratings after it has been subjected to ALS.
+* MAE will be applied to the UserVector of new_dataset_20m_ratings after it has been subjected to ALS.
+* MAE will be applied to the ContentVector of new_dataset_10m_movies after it has been subjected to CBWithKMeans.
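+
+Assuming the two files above are saved under hermes/hermes/configs, the new dataset can then be run like any other configuration (see the [How to Run Hermes](https://github.com/Lab41/hermes/tree/master/docs/run.md) guide):
+
+```bash
+$ hermes localhost:9000 ./hermes/configs/list_of_files.ini ./hermes/configs/new_dataset_config.ini
+```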
diff --git a/docs/framework.md b/docs/framework.md new file mode 100644 index 0000000..ee62f36 --- /dev/null +++ b/docs/framework.md @@ -0,0 +1,659 @@ +# Understanding Hermes's Framework + +* [Command Line Utilities](#command-line-utilities) +* [General Framework Concepts](#general-framework-concepts) +* [Main Components](#main-components) + * [hermesctl.py](#hermesctlpy) + * [Revising Hermes's Version Number](#revising-hermess-version-number) + * [Revising What to Log](#revising-what-to-log) + * [Understanding Spark Context](#understanding-spark-context) + * [Adding New Global Variables](#adding-new-global-variables) + * [Adding New States in State Machine](#adding-new-states-in-state-machine) + * [Adding New Variables in Cargo](#adding-new-variables-in-cargo) + * [Adding and Extracting New Configuration Variables](#adding-and-extracting-new-configuration-variables) + * [Adding New Configuration Variables](#adding-new-configuration-variables) + * [Extracting New Configuration Variables](#extracting-new-configuraiton-variables) + * [hermes.py](#hermespy) + * [Currently Defined States](#currently-defined-states) + * [start_state](#start_state) + * [json_to_rdd_state](#json_to_rdd_state) + * [split_data_state](#split_data_state) + * [make_prediction_state](#make_prediction_state) + * [calculate_metrics_state](#calculate_metrics_state) + * [error_state](#error_state) + * [Handling Multiple Next States](#handling-multiple-next-states) + * [Defining a New State](#defining-a-new-state) + * [hermesui.py](#hermesuipy) + * [Adding Additional UI](#adding-additional-ui) +* [Helper Components](#helper-components) + * [singleton.py](#singletonpy) + * [globals.py](#globalspy) + * [helper.py](#helperpy) + * [Adding New Global Helper Function](#adding-new-global-helper-functions) + * [cargo.py](#cargopy) + * [config.py](#configpy) + * [data.py](#datapy) + * [Adding New Vector Type](#adding-new-vector-type) + * [vectorgenerator.py](#vectorgeneratorpy) + * [Understanding What Vectors Are](#understanding-what-vectors-are) + * [Adding New Vector Type](#adding-new-vector-type-1) + * [Adding New Dataset](#adding-new-dataset) + * [Adding New Vector Transformation](#adding-new-vector-transformation) + * [recommendergenerator.py](#recommendergeneratorpy) + * [Adding New Recommender System Algorithms](#adding-new-recommender-system-algorithms) + * [Implementing a Different Use Case for a Specific Recommender System Algorithm](#implementing-a-different-use-case-for-a-specific-recommender-system-algorithm) + * [metricgenerator.py](#metricgeneratorpy) + * [Adding New Metric](#adding-new-metric) + * [statemachine.py](#statemachinepy) + * [timer.py](#timerpy) + +## Command Line Utilities + +Hermes uses Click as its command line utilities. To learn what parameters Hermes take for the command line, please read the guide [How to Run Hermes] +(https://github.com/lab41/hermes/tree/master/docs/run.md). + +## General Framework Concepts + +The goal of Hermes is to give user the ability to run multiple recommender system algorithms and metrics on a particular dataset to determine which recommender system works best for this dataset. For this reason, we want to make the framework as modular as possible so that user can implement his/her own recommender system algorithms or performance metrics as needed if they were not yet implemented by default, asssuming that the target user is a data scientist. + +Hermes relies on a state machine as its framework. The beauty of the state machine is that state machine allows modularity. 
Each state represents a particular functionality, and states do not have to follow a singular path. This means that each state has the option to go to multiple different states for its next state depending on the context it was placed. + +Currently, Hermes has 5 states defined; they are start_state, json_to_rdd_state, split_data_state, make_prediction_state, and calculate_metrics_state. These states make up a state machine that follows this particular path (which can be subjected to change): + +```bash +start_state -> json_to_rdd_state -> split_data_state -> make_prediction_state -> calculate_metrics_state +``` + +Details of what each state does is explained in [hermes.py](#hermespy). + +Reading this entire article will give you the complete understanding of what the framework does. But if you wanted a TL;DR version, please check out the following: +* If you do not know a particular term used in Hermes, please check out the glossary: + * [Glossary](https://github.com/Lab41/hermes/tree/master/docs/glossary.md) +* Understand the assumptions made + * [Assumptions](https://github.com/Lab41/hermes/tree/master/docs/assumptions.md) +* If you are planning to change the flow of the state machine, please read: + * [Adding New States in State Machine](#adding-new-states-in-state-machine) + * [Defining a New State](#defining-a-new-state) + * [Handling Multiple Next States](#handling-multiple-next-states) +* If you are planning to use your own dataset not yet supported by Hermes, please read: + * [Understanding What Vectors Are](#understanding-what-vectors-are) + * [Datasets Supported](https://github.com/Lab41/hermes/tree/master/docs/data_supported.md), in particular [Adding New Dataset](#adding-new-dataset). +* If you are planning to use your own recommender system algorithms not yet supported by Hermes, please read: + * [Recommender System Algorithms Supported](https://github.com/Lab41/hermes/tree/master/docs/recommenders_supported.md), in particular [Adding New Recommender System Algorithms](#adding-new-recommender-system-algorithms). +* If you are planning to use your own metrics not yet supported by Hermes, please read: + * [Metrics Supported](https://github.com/Lab41/hermes/tree/master/docs/metrics_supported.md), in particular [Adding New Metric](#adding-new-metric). + +## Main Components + +Hermes has three main components: hermesctl.py, hermes.py, and hermesui.py. +* hermesctl.py is the entry point; it also handles instantiation. +* hermes.py defines every state in the state machine. +* hermesui.py defines the command line UI used in hermes.py. + +### hermesctl.py + +**Path: hermes/hermes/hermesctl.py** + +When you run the hermes binary, it will call on the main() function found in hermesctl.py. + +hermesctl.py is responsible for +* printing Hermes's version number +* initializing global varibles +* instantiating state machines +* creating cargo used in state machines +* parsing the configuration files +* running state machine + +#### Revising Hermes's Version Number + +You can check Hermes's version number by running +```bash +$ hermes --version +``` + +Team members revise the version number found in `hermes/hermes/__init__.py.` + +#### Revising What to Log + +We employ the logging library to log INFO, DEBUG, and ERROR messages. The logger is a global variable with the name "hermes". + +All INFO messages are outputted to the command line. + +ALL DEBUG messages are outputted to the command line and a log file called hermes.log. hermes.log is created whenever the hermes binary is run. 
Debug messages will only print when the --verbose option is passed. + +ALL ERROR messages are outputted to the command line and stderr. + +#### Understanding Spark Context + +Spark Context will not be instantiated if you run the framework in an iPython notebook (TODO: in development). + +Otherwise, it is wrapped in a singleton pattern to avoid multiple instantiation with the app name of "hermes". The singleton wrapper is defined in [singleton.py](#singletonpy) + +#### Adding New Global Variables + +Global variables are defined in [globals.py](#globalspy) and instantiated in hermesctl's main(). + +To add a new global variable, please define it in the Globals class in [globals.py](#globalspy). + +A list of what global variables are currently defined can be found in [globals.py](#globalspy). + +#### Adding New States in State Machine + +You can add a new state to Hermes in hermesctl's add_states() function, but you need to define what the state does (including where it needs to go next) in [hermes.py](#hermespy). If the new state is an end state, meaning there is no other state to go to next, you have to specify that it is an end state. + +To add a state, add the following line in hermesctl's add_states(): +```bash +state_machine.add_state(hermes.new_state) +``` + +To add an end state, add the following line in hermesctl's add_states(): +```bash +state_machine.add_state(hermes.new_state, isEndState=True) +``` + +Make sure you define your state as well; otherwise, the framework will output an error. Please follow the instructions in [Defining a New State](#defining-a-new-state). + +#### Adding New Variables in Cargo + +Cargo is the object passed around in the state machine. Since we can never know until runtime where each state has derived from and where it will go next, we do not know what parameters to pass into each state. Cargo encapsulates all the parameters needed for each state in one object. It is defined in [cargo.py](#cargopy) and instantiated in hermesctl's main(). Future implementation will clean up Cargo so that one state does not know what another state's parameter needs are unless necessary (TODO: in development). + +To add a new variable in cargo for use in your newly defined state, please define it in the constructor of the Cargo class in [cargo.py](#cargopy). + +A list of what variables are currently defined in cargo can be found in [cargo.py](#cargopy). + +#### Adding and Extracting New Configuration Variables + +Configuration Files are currently extracted via the ConfigParser library. In the future, we might use ConfigObj as it supports subsections, which ConfigParser does not support (TODO: in development). + +Listed below are recognized sections and their respective items: +* datasets + * dataname + * user_vector_data + * user_vector_transformations + * user_vector_schemas + * content_vector_data + * content_vector_transformations + * content_vector_schemas +* recommenders + * recommenders +* metrics + * metrics + +What Hermes will do when it encounters unrecognized section or section's item: +* If it does not recognize the section, it will skip the entire section. +* In datasets section, if dataname is not specified, it will quit the program. +* In datasets section, if User Vector (user_vector_data, user_vector_transformation) or Content Vector (content_vector_data, content_vector_transformation) or both are not specified, it will quit the program. 
In the future, it will also quit the program if a Content Vector is specified without a User Vector (TODO: in development).
+* Any other items in the datasets section that are not recognized are treated as support_file items, meaning each variable is placed as a key and its value as the value in a dictionary called support_files, to be used later when generating the vector.
+* In the recommenders section, any items that are not recognized will be skipped. In the future, extra parameter variables needed for recommender system algorithms will be recognized (TODO: in development).
+* In the metrics section, any items that are not recognized will be skipped. In the future, extra parameter variables needed for calculating the metrics will be recognized (TODO: in development).
+
+Note that in the datasets section, if user_vector_data and user_vector_transformations are defined in the configuration file, hermesctl.py will store these values inside a UserVector Data object. Similarly, if content_vector_data and content_vector_transformations are defined in the configuration file, hermesctl.py will store these values inside a ContentVector Data object. All Data objects are then placed inside Cargo's data list.
+
+##### Adding New Configuration Variables
+
+Add any [new_section] to the configuration file. Add the new section's items underneath [new_section] as needed.
+
+##### Extracting New Configuration Variables
+
+To make your new section and its items recognizable, add them to [config.py](#configpy)'s HEADINGS variable.
+
+Handle their extraction in hermesctl's extract_configs() function. For handling the peculiarities of the section, follow the example of the datasets section. For handling recognized and unrecognized section items, use extract_configs()'s helper functions handle_recognized_section_item() and handle_unrecognized_section_item() respectively.
+
+### hermes.py
+
+**Path: hermes/hermes/hermes.py**
+
+hermes.py defines all functions for all states in the state machine.
+
+#### Currently Defined States
+
+##### start_state
+
+start_state creates the HDFS directory specified by the user (if the user does not specify it, the default is datasets) and loads all JSON files into this HDFS directory.
+
+##### json_to_rdd_state
+
+json_to_rdd_state converts each JSON file into its respective RDD, or Vector.
+
+##### split_data_state
+
+split_data_state splits the data in each Vector into training, test, and validation data depending on the input given by the user at runtime.
+
+##### make_prediction_state
+
+make_prediction_state takes the training data from each Vector, develops a model based on the training data and the recommender specified in the configuration file, and makes predictions based on this model.
+
+##### calculate_metrics_state
+
+calculate_metrics_state computes the metrics specified in the configuration file. This is an end state.
+
+##### error_state
+
+error_state is where states go when they encounter an error. This is an end state.
+
+#### Handling Multiple Next States
+
+If you want a state to go to multiple next states, define the switch in the state of interest and make sure you return newState and cargo, where newState is the name of the next state function and cargo has the necessary parameters initialized or added.
+
+#### Defining a New State
+
+Defining a new state is the same as defining a function in hermes.py.
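+
+For illustration, a new state might look like the hypothetical sketch below, assuming the convention described above that each state receives the Cargo object and returns the next state together with the updated cargo (the exact return convention is defined by hermes.py and statemachine.py):
+
+```python
+# Hypothetical example, not part of Hermes: a state that picks one of two
+# possible next states based on what it finds in cargo.
+def archive_results_state(cargo):
+    # do this state's work here, reading from and writing to cargo
+    if cargo.error_msg:
+        # something went wrong earlier; hand control to the error end state
+        return error_state, cargo
+    # otherwise continue along the normal path
+    return calculate_metrics_state, cargo
+```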
+Make sure you add the new state into the state machine by following the instructions in [Adding New States in State Machine](#adding-new-states-in-state-machine).
+
+### hermesui.py
+
+**Path: hermes/hermes/hermesui.py**
+
+hermesui.py defines all the command line user interfaces used in hermes.py.
+
+#### Adding Additional UI
+
+Most configuration can be addressed using the configuration file. However, if you need to ask the user for a configuration at runtime, define the UI function in hermesui.py and call it as needed in the required state.
+
+## Helper Components
+
+### singleton.py
+
+**Path: hermes/modules/singleton.py**
+
+SCSingleton is a singleton pattern object that wraps the Spark Context to avoid multiple instantiations of the Spark Context.
+
+### globals.py
+
+**Path: hermes/modules/globals.py**
+
+Listed below are the currently defined global variables:
+* verbose: a boolean variable that controls whether debug log messages are printed
+* logger: logging object that logs messages
+* scsingleton: singleton object that defines the Spark Context
+
+To add a new global variable, please see [Adding New Global Variables](#adding-new-global-variables).
+
+### helper.py
+
+**Path: hermes/modules/helper.py**
+
+helper.py defines all global helper functions used in multiple places throughout the framework.
+
+#### Adding New Global Helper Function
+
+To add a new global helper function, create the function in helper.py and import helper.py in the files that need it.
+
+### cargo.py
+
+**Path: hermes/modules/cargo.py**
+
+Cargo is the object passed around in the state machine. Since we can never know until runtime where each state has derived from and where it will go next, we do not know what parameters to pass into each state. Cargo encapsulates all the parameters needed for each state in one object.
+
+Listed below are the currently defined cargo variables:
+* hdfs_dir: Name of the HDFS directory that stores input data. One of the options passed in when running the hermes binary. Default = datasets.
+* fs_default_ip_addr: IP address of fs.default.name used in HDFS. One of the arguments passed in when running the hermes binary. Default = localhost:9000.
+* datas: List of Data objects initialized when extracting the configuration file.
+* vectors: List of Vector objects initialized during one of the states in the state machine, json_to_rdd_state.
+* support_files: Unrecognized items in the [datasets] section of the configuration file that are presumed to be support files for the creation of a Vector.
+* recommenders: List of recommender system algorithms initialized when extracting the configuration file.
+* metrics: List of metrics initialized when extracting the configuration file.
+* error_msg: Starts out as an empty string and is set to an error message that is passed to the error state.
+
+To add a new variable in cargo, please see [Adding New Variables in Cargo](#adding-new-variables-in-cargo).
+
+### config.py
+
+**Path: hermes/modules/config.py**
+
+config.py has a list of recognized sections and section items used in parsing the configuration file. It also defines functions that assist in parsing the configuration file.
+
+### data.py
+
+**Path: hermes/modules/data.py**
+
+Class Data is defined in data.py to store the configurations specified in the configuration file. We have not decided whether or not this is the best way to store configurations from the configuration file. (TODO: in development)
+
+Currently, it has subclasses called UserVectorData and ContentVectorData to differentiate the two Vector Types that Hermes supports.
+
+#### Adding New Vector Type
+
+Hermes has two vector types: UserVector and ContentVector. If you want to add a new vector type, you will need to follow the instructions in [Adding New Vector Type](#adding-new-vector-type-1) under vectorgenerator.py as well as add a respective Data object for storing its configuration.
+
+### vectorgenerator.py
+
+**Path: hermes/modules/vectorgenerator.py**
+
+#### Understanding What Vectors Are
+
+In Hermes, when we reference a vector, we mean a dataframe that has been converted to an RDD after a transformation occurs. This transformation is specified by the vector transformation. For example, if you have Movie Lens data, want to build a user vector from it, and specify the vector transformation "ratings" in the configuration file, the data from the JSON file is transformed into a dataframe and then an RDD of [(user_id, item_id, rating)]. In other words, the output of this transformation is a vector of [(user_id, item_id, rating)].
+
+There are two types of vectors: User Vector and Content Vector. User Vector refers to the vector describing users in the data. Content Vector refers to the vector describing content in the data. A collaborative filtering recommender system typically uses only the User Vector, and a content-based recommender system typically uses both the User Vector and the Content Vector, but this does not have to be the case.
+
+Every vector type inherits from the Vector class, meaning all User Vectors and Content Vectors will have the following variables:
+* data: a Data object containing the configuration for this particular vector from the configuration file
+* support_files: list of unrecognized variables in the [datasets] section of the configuration file that we assume are support files for the creation of a Vector
+* vector_transformation: transformation needed to convert data from a JSON file to a specified vector
+* training_vector: part of the vector that is split for training
+* test_vector: part of the vector that is split for testing
+* validation_vector: part of the vector that is split for validation
+* prediction_vector: part of the vector that is predicted based on test_vector and the model that is created from training_vector
+
+Since each dataset requires its own specific vector transformations, every dataset has its own class as well as its own UserVector and ContentVector. The dataset's UserVector and ContentVector inherit from both the dataset's own class and UserVector or ContentVector respectively. The dataset's UserVector and ContentVector have functions defined in their classes to execute vector transformations. The names of these functions have to match the names of the vector transformations passed in via the configuration file in order for the vector transformations to occur.
+
+Dataname is a variable used in the configuration file to identify the data each JSON file comes from. The dataset's own class has a check function called isSameDataInstance() to verify that the dataname passed in via the configuration file describes the same data as the class.
+
+To automatically create a vector (ie. deciding which vector type and from which data), VectorFactory comes to the rescue! It can either return a Vector object or the RDD / vector itself by calling VectorFactory().create_obj_vector(...) or VectorFactory().create_vector(...)
respectively. + +#### Adding New Vector Type + +UserVector and ContentVector are two vector types supported in Hermes. If you wanted to add a new vector type, create a class for your new vector type that inherits the Vector class. Add additional variables and functions as needed to the class. + +```bash +class MyNewVectorType(Vector): + pass +``` + +#### Adding New Dataset + +Same explanation can be found in [Datasets Supported's section on Adding New Datasets](https://github.com/Lab41/hermes/tree/master/docs/data_supported.md#adding-new-datasets). + +Every time you add a new dataset, you will need to create a new file in hermes/hermes/modules/vectors. The template for supporting an additional dataset is shown below. + +Template: + +```bash +from hermes.modules.vectorgenerator import UserVector, ContentVector + +class NewDataset(object): + @classmethod + def isSameDataInstance(cls, comparisonData): + return comparisonData.dataname == "new_dataset_dataname_name" + +class NewDatasetUserVector(UserVector, NewDataset): + def user_vector_transformation_1(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)) + + def user_vector_transformation_2(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)).filter(lambda (u, m, r): r > 3) + + def user_vector_transformation_n(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, -1 if row.rating < 3 else 1)) + +class NewDatasetContentVector(ContentVector, NewDataset): + def content_vector_transformation_1(self): + def internal_helper_function(row): + return np.array(( + int(row.genre_action), + int(row.genre_adventure), + int(row.genre_animation), + )) + return self.data.dataframe.map(lambda row: (row.movie_id, internal_helper_function(row))) + +``` + +1. Instantiate a class for your dataset. In this case, it is specified as class NewDataset. +2. Instantiate a User Vector and a Content Vector class for your dataset that inherits from your dataset class and UserVector or Content Vector respectively. In this case, the UserVector for NewDataset is called NewDataSetUserVector, and the ContentVector for NewDataset is called NewDataContentVector. +3. Provide the dataname name for the check in isSameDataInstance(). In this case, dataname is checked if it's equal to "new_dataset_dataname_name". +4. Provide the vector transformation logic for each type of vectors. For User Vector transformations, define the function in the class NewDatasetUserVector. In this case, these vector transformations are user_vector_transformation_1, user_vector_transformation_2, and user_vector_transformation_n. For Content Vector transformations, define the function in the class NewDatasetContentVector. In this case, the vector transformation is content_vector_trasnformation_1. +5. Additional support files needed for the vector transformation is passed down from the configuration file as self.support_files. self.support_files is a dictionary with the key as a variable and the value as the value received in the configuration file. Please read on the [configuration file guide](https://github.com/Lab41/hermes/tree/master/docs/configs.md#optional-variables) for more details. + +After you have defined the concrete implementation of the new dataset, you can now use the dataset and apply multiple recommender system algorithms and metrics. 
+ +In list_of_files.ini: +```bash +[new_dataset_dataname_name] +new_dataset_10m_ratings = /path/to/your/new/dataset/10m/ratings.json.gz +new_dataset_20m_ratings = /path/to/your/new/dataset/20m/ratings.json.gz +new_dataset_10m_ratings_schema = /path/to/your/new/dataset/10m/ratings_schema.json.gz +new_dataset_20m_ratings_schema = /path/to/your/new/dataset/20m/ratings_schema.json.gz + +new_dataset_10m_movies = /path/to/your/new/dataset/10m/movies.json.gz +new_dataset_10m_movies_schema = /path/to/your/new/dataset/10m/movies_schema.json.gz +``` + +In new_dataset_config.ini: +```bash +[datasets] +dataname = new_dataset_dataname_name + +# user vector +user_vector_data = ["new_dataset_10m_ratings", "new_dataset_20m_ratings"] +user_vector_schemas = ["new_dataset_10m_ratings_schema", "new_dataset_20m_ratings_schema"] +user_vector_transformations = ["user_vector_transformation_1", "user_vector_transformation_2"] + +# content vector +content_vector_data = ["new_dataset_10m_movies"] +content_vector_schema = ["new_dataset_10m_movies_schema"] +content_vector_transformations = ["content_vector_trasnformation_1"] + +[recommenders] +user_recommenders = ["ALS"] +content_recommenders = ["CBWithKMeans"] + +[metrics] +metrics = ["RMSE", "MAE"] +``` + +When you run hermes with the above configuration, the following will happen: +* user_vector_transformation_1 will be applied to new_dataset_10m_ratings. +* user_vector_transformation_2 will be applied to new_dataset_20m_ratings. +* content_vector_transformation_1 will be applied to new_dataset_10m_movies. +* ALS will be applied to UserVector of new_dataset_10m_ratings. +* ALS will be applied to UserVector of new_dataset_20m_ratings. +* CBWithKMeans will be applied to ContentVector of new_dataset_10m_movies. +* RMSE will be applied to UserVector of new_dataset_10m_ratings after ALS has been subjected to it. +* RMSE will be applied to UserVector of new_dataset_20m_ratings after ALS has been subjected to it. +* RMSE will be applied to ContentVector of new_dataset_10m_ratings after CBWithKMeans has been subjected to it. +* MAE will be applied to UserVector of new_dataset_10m_ratings after ALS has been subjected to it. +* MAE will be applied to UserVector of new_dataset_20m_ratings after ALS has been subjected to it. +* MAE will be applied to ContentVector of new_dataset_10m_ratings after CBWithKMeans has been subjected to it. + +#### Adding New Vector Transformation + +To add a new vector transformation, go to the data class itself and decide which vector type it is. Under the class of the vector type, define the new vector transformation as a class function. + +For example: if you wanted to create a vector transformation for MovieLens data's UserVector, do the following: +```bash +class MovieLens(object): + @classmethod + def isSameDataInstance(cls, comparisonData): + return comparisonData.dataname == "movielens" + +class MovieLensUserVector(UserVector, MovieLens): + def ratings(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)) + + def new_vector_transformation: + # your defined vector transformation + ... + return vector_after_the_transformation + +``` +Except instead of naming the new function as new_vector_trasnformation, name it according to what you want to use in the configuration file. + +### recommendergenerator.py + +**Path: hermes/modules/recommendergenerator.py** + +recommendergenerator.py is built to allow clearer execution of code using the bridge pattern. For example, let's try to create a model using ALS. 
To do so, we can execute the following:
+
+```bash
+import recommendergenerator as rg
+
+recommender = rg.ALS(vector)
+```
+
+We can then make a prediction because the Recommender object already knows which recommender system algorithm to use as well as the training and test data from the vector that we passed in.
+
+```bash
+prediction_vector = recommender.make_prediction()
+```
+
+If we have a specific use case that is different from the normal ALS use case, we can define that abnormal use case for ALS and call it as follows:
+
+```bash
+abnormal_usecase = AbnormalUseCase()
+recommender = ALS(abnormal_usecase)
+prediction_vector = recommender.make_prediction()
+```
+
+We do not have to call the make_prediction() function differently. We just call make_prediction(), and it will call the make_prediction() specific to the abnormal use case.
+
+Also, when you change the recommender system algorithm, say to CBWithKMeans, all you need to do is create the corresponding Recommender object; when you are ready to make your prediction, call make_prediction() and, behind the scenes, it will call CBWithKMeans's make_prediction().
+
+```bash
+recommender = CBWithKMeans()
+prediction_vector = recommender.make_prediction()
+```
+
+#### Adding New Recommender System Algorithms
+
+The same explanation can be found in [Recommenders Supported's section on Adding New Recommender System Algorithms](https://github.com/Lab41/hermes/tree/master/docs/recommenders_supported.md#adding-new-recommender-system-algorithms).
+
+To add a new recommender system algorithm, create a class that inherits from the Recommender class and define a make_prediction() function that calls the recommender system algorithm's own prediction function.
+
+```bash
+class NewRecommenderSystemAlgorithm(Recommender):
+    def make_prediction(self):
+        return self.implementation.make_prediction_with_new_recommender_system_algorithm(self.vector)
+```
+
+self.implementation is the use case that you want to use. The default use case is the Normal class. If you have another use case, for example an abnormal use case, create a class (called, say, Abnormal) that inherits from ImplementationInterface.
+
+So let's do that and define an abnormal use case.
+```bash
+class Abnormal(ImplementationInterface):
+    pass
+```
+
+Let's say we want to define the make_prediction() function for both the normal and the abnormal use cases. The first thing we need to do is define the make_prediction() function for our new recommender system algorithm in the ImplementationInterface, so that any use case that does not implement our new recommender system algorithm's make_prediction() function will fail by raising NotImplementedError.
+
+```bash
+class ImplementationInterface(object):
+    def make_prediction_with_als(self):
+        raise NotImplementedError
+
+    def make_prediction_with_cbwithkmeans(self):
+        raise NotImplementedError
+
+    def make_prediction_with_new_recommender_system_algorithm(self):
+        raise NotImplementedError
+```
+
+After you have defined it in the ImplementationInterface class, you also want to define it in the Normal class.
+
+```bash
+class Normal(ImplementationInterface):
+    def make_prediction_with_als(self):
+        ...
+        return prediction_vector
+
+    def make_prediction_with_cbwithkmeans(self):
+        ...
+        return prediction_vector
+
+    def make_prediction_with_new_recommender_system_algorithm(self):
+        # implement your make_prediction() for the normal use case
+        return prediction_vector
+```
+
+Now implement it in your Abnormal class too.
+```bash
+class Abnormal(ImplementationInterface):
+    def make_prediction_with_new_recommender_system_algorithm(self):
+        # implement your make_prediction() for the abnormal use case
+        return prediction_vector
+```
+
+You are done. :)
+
+#### Implementing a Different Use Case for a Specific Recommender System Algorithm
+
+Let's do this with the ALS recommender system algorithm. We want to create an abnormal use case. To do so, we need to create the Abnormal class that inherits from ImplementationInterface.
+
+```bash
+class Abnormal(ImplementationInterface):
+    pass
+```
+
+Since ALS's make_prediction() function is already defined for the normal use case, we just need to define it in the abnormal use case as well, with the abnormal use case's implementation.
+
+```bash
+class Abnormal(ImplementationInterface):
+    def make_prediction_with_als(self):
+        # implement ALS's make_prediction() for the abnormal use case
+        return prediction_vector
+```
+
+You are done. :)
+
+### metricgenerator.py
+
+**Path: hermes/modules/metricgenerator.py**
+
+metricgenerator.py is also built to allow clearer execution of code, using the strategy pattern. It provides a MetricExecutor that executes different types of metrics and can switch metrics at runtime.
+
+For example, say we want to execute RMSE and then execute PRFS with different vectors:
+
+```bash
+executor = MetricExecutor(RMSE())
+print executor.execute(vector1)
+print executor.execute(vector2)
+executor = MetricExecutor(PRFS())
+print executor.execute(vector1)
+print executor.execute(vector2)
+```
+
+MetricFactory is a class that automatically instantiates the appropriate metric depending on what is specified in the configuration file.
+
+#### Adding New Metric
+
+The same explanation can be found in [Metrics Supported's section on Adding New Metric](https://github.com/Lab41/hermes/tree/master/docs/metrics_supported.md#adding-new-metric).
+
+To add a new metric, create a class that inherits from the Metric class and define a calculate_metric function in the class.
+
+```bash
+class MyCoolNewMetric(Metric):
+    def calculate_metric(self, vector):
+        # calculate your cool new metric here
+        # or
+        # define your cool new metric in hermes/metrics/performance_metrics.py
+        return metrics.performance_metrics.calculate_my_cool_new_metric(vector.test_vector, vector.prediction_vector)
+```
+
+### statemachine.py
+
+**Path: hermes/modules/statemachine.py**
+
+statemachine.py defines the concrete implementation of the state machine.
+
+Here is how you can use a state machine:
+```bash
+# state1 -> state2 -> state3a
+#                  -> state3b
+# where state1, state2, state3a, and state3b are defined functions.
+
+from modules.statemachine import StateMachine
+
+sm = StateMachine()
+sm.add_state(state1)
+sm.add_state(state2)
+sm.add_state(state3a, isEndState=True)
+sm.add_state(state3b, isEndState=True)
+sm.set_start(state1)
+sm.run()
+
+# or, if you have a cargo defined, instead of sm.run() you can do the following:
+# sm.run(Cargo())
+```
+
+### timer.py
+
+**Path: hermes/modules/timer.py**
+
+timer.py defines a Timer class that you can use anywhere in the code to time how long a particular function runs.
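+The class is used as a context manager. Below is a minimal sketch of what such a context manager could look like; it is an illustration only, not the actual implementation in hermes/modules/timer.py, and it assumes nothing beyond the secs and msecs attributes used in the example that follows.
+
+```python
+import time
+
+class Timer(object):
+    """Context manager that records how long the enclosed block took to run."""
+    def __enter__(self):
+        self.start = time.time()
+        return self
+
+    def __exit__(self, *args):
+        self.end = time.time()
+        self.secs = self.end - self.start   # elapsed time in seconds
+        self.msecs = self.secs * 1000       # elapsed time in milliseconds
+```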
+
+For example, if you wanted to time how long somefunction() runs, do the following:
+```bash
+from modules.timer import Timer
+
+with Timer() as t:
+    somefunction()
+print("somefunction() took %s seconds or %s milliseconds" % (t.secs, t.msecs))
+```
diff --git a/docs/glossary.md b/docs/glossary.md new file mode 100644 index 0000000..8af7921 --- /dev/null +++ b/docs/glossary.md @@ -0,0 +1,86 @@
+# Glossary
+
+This is a glossary of common terms used in Hermes and their meanings.
+
+## A
+
+## B
+
+## C
+**Cargo**: Cargo is the object passed around in the state machine. Since we can never know until runtime which state a given state was reached from and where it will go next, we do not know what parameters to pass into each state. Cargo encapsulates all the parameters needed by each state in one object. It is defined in cargo.py and instantiated in hermesctl's main().
+
+**Content Vector**: Content Vector refers to the vector describing the content in the data.
+
+## D
+**Dataframe**: A DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. (Excerpt taken from Spark's SQL Programming Guide.) In Hermes, the dataframe variable defined in the Data class refers to the dataframe created after reading in the JSON file.
+
+**Dataname**: Dataname is a variable used in the configuration file to refer to the dataset that each JSON file comes from.
+
+## E
+
+## F
+
+## G
+
+## H
+
+## I
+
+## J
+
+## K
+
+## L
+
+## M
+
+**Metrics**: See "Performance Metrics".
+
+## N
+
+## O
+
+## P
+
+**Performance Metrics**: Performance Metrics allow users to evaluate a recommender and how much value it adds for the end user.
+
+## Q
+
+## R
+**RDD**: Resilient Distributed Dataset or RDD is the basic abstraction in Spark that represents an immutable, partitioned collection of elements that can be operated on in parallel. (Excerpt taken from Spark's documentation on RDDs.)
+
+**Recommender**: See "Recommender System Algorithms".
+
+**Recommender System Algorithms**: Hermes uses Recommender System Algorithms to build a model based on the train data and make predictions based on the test data.
+
+## S
+
+## T
+**Tradespace**: The space spanned by completely enumerated design variables; that is, given a set of design variables, the tradespace is the space of possible design options. (Excerpt taken from Adam M. Ross & Daniel E. Hastings' "The Tradespace Exploration Paradigm".)
+
+**Test Data**: Data is usually split into train data, test data, and validation data. After you have used the train data to build a model and the validation data to select the best performing model, you use the test data to estimate the accuracy of the selected approach. In other words, you want to estimate how well your model has been trained.
+
+**Train Data**: Data is usually split into train data, test data, and validation data. Train data is used by a recommender to build a model by pairing the input with the expected output.
+
+## U
+**User Vector**: User Vector refers to the vector describing users in the data.
+
+## V
+**Validation Data**: Data is usually split into train data, test data, and validation data. Validation data is used to select the best performing model out of all the models you trained with the train data. Sometimes validation data is optional.
+ +**Vector**: In Hermes, when we referenced a vector, it refers to a dataframe that has been converted to a RDD after a transformation occurs. This transformation is specified by the vector transformation. For example, in the case of a user vector, if the vector transformation is "ratings" for Movie Lens data, the data from the JSON file is transformed into a RDD of [(user_id, item_id, rating)]. The output of this transformation is a vector of [(user_id, item_id, rating)]. + +**Vector Transformation**: In Hermes, vector transformation refers to the transformation needed to convert data from a JSON file to a specified vector. Please see **Vector** for more details. + +**Vector Type**: Hermes separates vectors into two distinct types: User Vector and Content Vector. User Vector refers to the vector describing users in the data. Content Vector refers to the vector describing content in the data. Users can implement other vector types as needed if User Vector and Content Vector does not describe the vector they are building. + +**Vectorizer**: see "Dataname". This is a deprecated name used before we decided to stick with "Dataname". + +## W + +## X + +## Y + +## Z \ No newline at end of file diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 0000000..7769da9 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,89 @@ +# Hermes Installation Guide + +## Dependencies: +* Spark 1.5.1 +* Scala 2.11.7 +* Pyspark 0.8.2.1 +* Hadoop 2.7.1 +* virtualenv + +## How to Install Dependencies on Mac OS X: +### Installing Spark, Scala, and PySpark +1. Install Java + 1. Download + 2. Double click on .dmg file to install. + 3. In a terminal, type java -version. You should see the following: + ```bash + java version "1.8.0_65" + Java(TM) SE Runtime Environment (build 1.8.0_65-b17) + Java HotSpot(TM) 64-Bit Server VM (build 25.65-b01, mixed mode) + ``` +2. Set JAVA_HOME + ```bash + export JAVA_HOME=$(/usr/libexec/java_home) + ``` + +3. Install Homebrew + ```bash + $ ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" + ``` + +4. Install Scala + ``` + $ brew install scala + ``` + +5. Download Spark from https://spark.apache.org/downloads.html. + +6. Set SCALA_HOME and SPARK_HOME and export it to path in your .bash_profile. + ```bash + export SPARK_HOME=/path/to/your/spark + export PATH=$PATH:$SPARK_HOME/bin + export SCALA_HOME=/path/to/your/scala + export PATH=$PATH:$SCALA_HOME/bin + ``` + +7. Export PySpark classes to the Python path after you have installed Python. + ```bash + export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH + ``` + +8. Build and install Apache Spark + ```bash + $ brew install sbt + $ cd $SPARK_HOME + $ sbt/sbt clean assembly + ``` + +### Installing Hadoop +Please follow this [guide](http://zhongyaonan.com/hadoop-tutorial/setting-up-hadoop-2-6-on-mac-osx-yosemite.html). + +### Installing virtualenv +Please read this [guide](http://docs.python-guide.org/en/latest/dev/virtualenvs/) for more details. +```bash +$ pip install virtualenv +``` + +## How to Install Hermes: + +(Optional) After you have installed the dependencies, if you have different projects that require different Python environment, you can use a Virtual Environment. As listed in the Virtual Environment's [site](http://docs.python-guide.org/en/latest/dev/virtualenvs/), "a Virtual Environment is a tool to keep the dependencies required by different projects in separate places, by creating virtual Python environments for them." 
+ +```bash +$ virtualenv name_of_your_virtualenv +$ . name_of_your_virtualenv/bin/activate +``` + +To install Hermes, run +```bash +$ python setup.py install +``` + +This will create a binary called hermes in /usr/local/bin/hermes. Instead of running the binary with the entire path (ie. ./usr/local/bin/hermes), you can install it so that you can run hermes automatically on the command line. +```bash +$ pip install --editable . +``` + +Now, you can just run hermes the binary and it will prompt you with what you want to do with the data that you have. +```bash +$ hermes +``` diff --git a/docs/metrics_supported.md b/docs/metrics_supported.md new file mode 100644 index 0000000..e7d1cc9 --- /dev/null +++ b/docs/metrics_supported.md @@ -0,0 +1,51 @@ +# Metrics Supported + +* [RMSE](#rmse) +* [MAE](#mae) +* [PRFS](#prfs) +* [Adding New Metric](#adding-new-metric) + +This excerpt is taken out from [Understanding Hermes's Framework](https://github.com/Lab41/hermes/tree/master/docs/framework.md#metricgeneratorpy). It will be helpful if you read this guide first. + +metricgenerator.py is also built to allow clearer execution of code using the strategy pattern. You have MetricExceutor that executes different types of metrics and change the metrics during runtime. + +For example: we want to execute RMSE and then execute PRFS with different vectors. + +```bash +exeggutor = MetricExecutor(RMSE()) +print exeggutor.execute(vector1) +print exeggutor.execute(vector2) +exeggutor = MetricExecutor(PRFS) +print exeggutor.execute(vector1) +print exeggutor.execute(vector2) +``` + +MetricFactory() is a class that will automatically instantiate which metric dependent on what is specified in the configuration file. + +## RMSE + +Explanation of what RMSE does will be provided in the future. (TODO) + +## MAE + +Explanation of what MAE does will be provided in the future. (TODO) + +## PRFS + +Explanation of what PRFS does will be provided in the future. (TODO) + + +#### Adding New Metric + +This excerpt is taken out from [Understanding Hermes's Framework](https://github.com/Lab41/hermes/tree/master/docs/framework.md#adding-new-metric). + +To add a new metric, create a class that inherits from the Metric class and define a calculate_metric function in the class. + +```bash +class MyCoolNewMetric(Metric): + def calculate_metric(self, vector): + # calculate your cool new metric here + # or + # define your cool new metric in hermes/metrics/performance_metrics.py + return metrics.performance_metrics.calculate_my_cool_new_metric(vector.test_vector, vector.prediction_vector) +``` \ No newline at end of file diff --git a/docs/recommenders_supported.md b/docs/recommenders_supported.md new file mode 100644 index 0000000..6eba87b --- /dev/null +++ b/docs/recommenders_supported.md @@ -0,0 +1,117 @@ +# Recommender Algorithms Supported + +* [ALS](#als) + * [Use Cases Supported](#use-cases-supported) +* [Content Base with K-Means](#content-base-with-k-means) + * [Use Cases Supported](#use-cases-supported-1) +* [Adding New Recommender System Algorithms](#adding-new-recommender-system-algorithms) + +This excerpt is taken out from [Understanding Hermes's Framework](https://github.com/Lab41/hermes/tree/master/docs/framework.md#recommendergeneratorpy). It will be helpful if you read this guide first. + +recommendergenerator.py is built to allow clearer execution of code using the bridge pattern. For example, let's try to create a model using ALS. 
To do so, we can execute the following: + +```bash +import recommendergenerator as rg + +recommender = rg.ALS(vector) +``` + +We can then make a prediction because the Recommender object already knows about the recommender system algorithm to use as well as the training and test data from the parameter vector that we passed in. + +```bash +prediction_vector = recommender.make_prediction() +``` + +If we have a specific use case that is different than the normal ALS use case, we can define that abnormal use case for ALS and call it as follows: + +```bash +abnormal_usecase = AbnormalUseCase() +recommender = ALS(abnormal_usecase) +prediction_vector = recommender.make_prediction() +``` + +We do not have to call the make_prediction() function differently. We just call make_prediction() because it will call make_prediction() specifically for the abnormal use case. + +Also, when you change the recommender system algorithm, say for example CBWithKMeans, all you need to do is create this Recommender object and when you are ready to make your prediction, call make_prediction() because it will make sure that behind the scene, it will call CBWithKMeans's make_prediction(). + +```bash +recommender = CBWithKMeans() +prediction_vector = recommender.make_prediction() +``` + +## ALS + +Explanation of what ALS does will be provided in the future. (TODO) + +### Use Cases Supported + +* Normal Use Case + +## Content Base with K-Means + +Explanation of what Content Base with K-Means will be provided in the future. (TODO) + +### Use Cases Supported + +* Normal Use Case + +#### Adding New Recommender System Algorithms + +This excerpt is taken out from [Understanding Hermes's Framework](https://github.com/Lab41/hermes/tree/master/docs/framework.md#adding-new-recommender-system-algorithms). + +To add a new recommender system algorithm, instantiate a class that inherits from Recommender class and defines the make_prediction() function that calls on the recommender system algorithm's own make prediction function. + +```bash +class NewRecommenderSystemAlgorithm(Recommender): + def make_prediction(self) + return self.implementation.make_prediction_with_new_recommender_system_algorithm(self.vector) +``` + +self.implementation is the use case that you want to use. The default use case is the Normal class. If you have another use case, for example: an abnormal use case, you want to instantiate a class called Abnormal, for example, that inherits from ImplementationInterface. + +So let's do that, let's define an abnormal use case. +```bash +class Abnormal(ImplementationInterface): + pass +``` + +Let's say we want to define the make_prediction() function for both normal and abnormal use case. Therefore, the first thing we need to do is define the make_prediction() function for our new recommender system algorithm in the ImplementationInterface so that in case there is another use case that does not implement our new recommender system algorithm's make_prediction() function, it will fail by raising a NotImplemented error. + +```bash +class ImplementationInterface(object): + def make_prediciton_with_als(self): + raise NotImplemented + + def make_prediction_with_cbwithkmeans(self): + raise NotImplemented + + def make_prediction_with_new_recommender_system_algorithm(self): + raise NotImplemented +``` + +After you defined in the ImplementationInterface class, you also want to define it in Normal class. + +```bash +class Normal(ImplementationInterface): + def make_prediction_with_als(self): + ... 
+ return prediciton_vector + + def make_prediction_with_cbwithkmeans(self): + ... + return prediction_vector + + def make_prediction_with_new_recommender_system_algorithm(self): + # implement your make_prediction() for the normal use case + return prediciton_vector +``` + +Now begin implementing it in your Abnormal class too. +```bash +class Abnormal(ImplementationInterface): + def make_prediction_with_new_recommender_system_algorithm(self): + # implement your make_prediction() for the abnormal use case + return prediction_vector +``` + +You are done. :) \ No newline at end of file diff --git a/docs/run.md b/docs/run.md new file mode 100644 index 0000000..8d2adf0 --- /dev/null +++ b/docs/run.md @@ -0,0 +1,49 @@ +# How to Run Hermes + +Hermes requires at least three arguments in order to run properly. +* fs_default_ip_addr: IP address of fs.default.name used in HDFS, ie. localhost:9000. +* list_of_files_config: A configuration file that lists all the json paths referenced by configs. +* configs: Users can provide an unlimited amount of configuration files that list what datasets to use and which recommender algorithms and metrics to apply to each dataset. + +For more details about list_of_files_config and configs, please read the [Configuration Files Guide](https://github.com/Lab41/hermes/tree/master/docs/configs.md). + +With one configuration file: +```bash +$ hermes localhost:9000 ./hermes/configs/list_of_files.ini ./hermes/configs/config1.ini +``` + +With more than one configuration files: +```bash +$ hermes localhost:9000 ./hermes/configs/list_of_files.ini ./hermes/configs/config1.ini ./hermes/configs/config2.ini +``` + +## Options + +The hermes binary can take in multiple options: +* --version +* --verbose +* --hdfs_dir + +### --version +--version displays the current hermes binary version number. The binary version number is located in hermes/hermes/__init__.py under the variable __version__. + +```bash +$ hermes --version +``` + +### --verbose +--verbose will print out all debug messages to help you debug the code. + +```bash +$ hermes --verbose localhost:9000 ./hermes/configs/list_of_files.ini ./hermes/configs/config1.ini +``` + +### --hdfs_dir +--hdfs_dir requires you to pass in the name of the HDFS directory to store the input data given in the configuration files. The default name is set as "datasets". + +```bash +$ hermes --hdfs_dir datasets localhost:9000 ./hermes/configs/list_of_files.ini ./hermes/configs/config1.ini +``` + + + diff --git a/docs/using_notebook.md b/docs/using_notebook.md new file mode 100644 index 0000000..f71841e --- /dev/null +++ b/docs/using_notebook.md @@ -0,0 +1,105 @@ +# How to use iPython Notebook + +1. Install Anaconda +2. Launch Anaconda launcher +3. Launch ipython-notebook +4. Create an iPython profile for use with PySpark + 1. Make sure you have exported PySpark classes to your python path and build Apache Spark. + To export PySpark classes, add the following to your ~/.bash_profile: + + ```bash + # export spark to path + export SPARK_HOME=/path/to/your/spark + export PATH=$PATH:$SPARK_HOME/bin + # export pyspark classes to the python path + export PYTHONPATH=$SPARK_HOME/python:$PYTHONPATH + # export py4j to the python path + export PYTHONPATH=$SPARK_HOME/python/lib/py4j--src.zip:$PYTHONPATH + ``` + + 2. Build Apache Spark + + ```bash + $ cd $SPARK_HOME + $ sbt/sbt clean assembly + ``` + + ```bash +ipython profile create pyspark + ``` +5. 
Create a iPython notebook configuration + + ```bash +vim ~/.ipython/profile_pyspark/ipython_notebook_config.py + ``` + ```bash + c = get_config() + + # kernel configuration + c.IPKernelApp.pylab = 'inline' # set %matplotlib inline always + + # notebook configuration + c.NotebookApp.ip = '*' # '*' == to bind on all IPs + # do not open the browser at start of ipython notebook + # so that we can point the ipython notebook address + # in an active web browser + c.NotebookApp.open_browser = False + + # (optional) you can add password to your notebook if desired + + # set a fixed port number that does not conflict with other iPython profiles + c.NotebookApp.port = 8880 + ``` +6. Create PySpark Setup configuration + ```bash + vim ~/.ipython/profile_pyspark/startup/00-pyspark-setup.py + ``` + ```bash + import os + import sys + import findspark + + # setup spark home + findspark.init() + spark_home = findspark.find() + + # setup spark home approach #2 + # make sure you have already set $SPARK_HOME in $PATH + # spark_home = os.environ.get('SPARK_HOME', None) + + # add spark's home directory to path + sys.path.insert(0, os.path.join(spark_home, "python")) + + # add py4j to path + sys.path.insert(0, os.path.join(spark_home, "python/lib/py4j-0.8.2.1-src.zip")) + + # initialize pyspark to predefine the SparkContext variable "sc" + execfile(os.path.join(spark_home, "python/pyspark/shell.py")) + ``` + +7. Run iPython notebook in your desired directory + ```bash + ipython notebook --profile=pyspark + ``` + +8. Test to see if sc is defined. If not, setup the SparkContext and SQLContext by doing the following in your iPython notebook + ```bash + from pyspark import SparkContext + from pyspark.sql import SQLContext + + # setup SparkContext + try: + sc = SparkContext() + except: + sc = SparkContext._active_spark_context + + # setup SQLContext + sqlCtx = SQLContext(sc) + ``` + +9. When you are reading your JSON, you need to determine your fs.default.name or fs.defaultFS. You can figure this out by checking out the core-site.xml file. This can be found in Mac OS at /usr/local/Cellar/hadoop//libexec/etc/hadoop/core-site.xml. To read JSON using SQLContext, you have to add this ip address when calling the function. + +For example: your fs.default.name or fs.defaultFS is hdfs://localhost:9000. 
To use one of the JSON files that you have put into the datasets directory in HDFS, you have to call as follows: + ```bash + dataframe = sqlCtx.read.json("hdfs://localhost:9000/datasets/movielens_1m_movies.json.gz") + ``` diff --git a/hermes/__init__.py b/hermes/__init__.py new file mode 100644 index 0000000..33d7f0e --- /dev/null +++ b/hermes/__init__.py @@ -0,0 +1,7 @@ +__version__ = '1.0' +import hermes +import hermesctl +import hermesui +import modules +import algorithms +import utils \ No newline at end of file diff --git a/hermes/algorithms/__init__.py b/hermes/algorithms/__init__.py new file mode 100644 index 0000000..70fb85c --- /dev/null +++ b/hermes/algorithms/__init__.py @@ -0,0 +1,2 @@ +import content_based +import performance_metrics \ No newline at end of file diff --git a/src/algorithms/cf.py b/hermes/algorithms/cf.py similarity index 100% rename from src/algorithms/cf.py rename to hermes/algorithms/cf.py diff --git a/src/algorithms/content_based.py b/hermes/algorithms/content_based.py similarity index 100% rename from src/algorithms/content_based.py rename to hermes/algorithms/content_based.py diff --git a/src/algorithms/content_based_kmeans.py b/hermes/algorithms/content_based_kmeans.py similarity index 100% rename from src/algorithms/content_based_kmeans.py rename to hermes/algorithms/content_based_kmeans.py diff --git a/src/data_prep/movieLens_vectorize.py b/hermes/algorithms/data_prep/movieLens_vectorize.py similarity index 100% rename from src/data_prep/movieLens_vectorize.py rename to hermes/algorithms/data_prep/movieLens_vectorize.py diff --git a/src/data_prep/wiki_vectorize.py b/hermes/algorithms/data_prep/wiki_vectorize.py similarity index 100% rename from src/data_prep/wiki_vectorize.py rename to hermes/algorithms/data_prep/wiki_vectorize.py diff --git a/src/algorithms/performance_metrics.py b/hermes/algorithms/performance_metrics.py similarity index 83% rename from src/algorithms/performance_metrics.py rename to hermes/algorithms/performance_metrics.py index ae58826..934df79 100644 --- a/src/algorithms/performance_metrics.py +++ b/hermes/algorithms/performance_metrics.py @@ -13,11 +13,48 @@ from sklearn.metrics import jaccard_similarity_score import itertools +def get_perform_metrics(y_test, y_train, y_predicted, content_array, sqlCtx, num_predictions=100, num_partitions=30): + results = {} + + #because some of the algorithms we will use will only return n predictions per user all results should be analyazed for n recommendations + n_predictions = predictions_to_n(y_predicted, number_recommended=num_predictions) + + results['rmse'] = calculate_rmse_using_rdd(y_test, n_predictions) + results['mae'] = calculate_mae_using_rdd(y_test,n_predictions) + results['pred_n'] = calculate_precision_at_n(y_test, n_predictions, number_recommended=num_predictions) + + #measures of diversity + results['cat_diversity'] = calculate_population_category_diversity(n_predictions, content_array) + results['ils'] = calc_ils(n_predictions, content_array, num_partitions=num_partitions) + + #measures of coverage + results['cat_coverage'] = calculate_catalog_coverage(y_test, y_train, n_predictions) + results['item_coverage'] = calculate_item_coverage(y_test, y_train, n_predictions) + results['user_coverage'] = calculate_user_coverage(y_test, y_train, n_predictions) + results['pred_coverage'] = calculate_prediction_coverage(y_test, n_predictions) + + #measures of serendipity + results['serendipity'] = calculate_serendipity(y_train, y_test, n_predictions, sqlCtx, rel_filter=1) + 
results['content_serendipity'] = calc_content_serendipity(y_test, n_predictions, content_array, sqlCtx) + + #measures of novelty + results['novelty'] = calculate_novelty(y_train, y_test, n_predictions, sqlCtx) + + #relevancy statistics + rel_stats = calc_relevant_rank_stats(y_test, n_predictions, sqlCtx) + results['avg_highest_rank'] = rel_stats[0] + results['avg_mean_rank'] = rel_stats[1] + results['avg_lowest_rank'] = rel_stats[2] + + return results + + + # Accuracy of ratings predictions (aka regression metrics) ===================== # RMSE ----------------------------------------------------------------- -def calculate_rmse_using_rdd(y_actual, y_predicted): +def calculate_rmse(y_actual, y_predicted): """ Determines the Root Mean Square Error of the predictions. @@ -36,26 +73,11 @@ def calculate_rmse_using_rdd(y_actual, y_predicted): sum_ratings_diff_sq = ratings_diff_sq.reduce(add) num = ratings_diff_sq.count() - return sqrt(sum_ratings_diff_sq / float(num) ) - -def calculate_rmse_using_array(y_actual, y_predicted): - """ - Determines the Root Mean Square Error of the predictions. - - Args: - y_actual: actual ratings in the format of an array of [ (userId, itemId, actualRating) ] - y_predicted: predicted ratings in the format of an array of [ (userId, itemId, predictedRating) ] - - Assumptions: - y_actual and y_predicted are in the same order. - - """ - return sqrt(mean_squared_error(y_actual, y_predicted)) - #return mean_squared_error(y_actual, y_predicted) ** 0.5 + return sqrt(sum_ratings_diff_sq / float(num)) # MAE ------------------------------------------------------------------ -def calculate_mae_using_rdd(y_actual, y_predicted): +def calculate_mae(y_actual, y_predicted): """ Determines the Mean Absolute Error of the predictions. @@ -80,11 +102,32 @@ def calculate_mae_using_rdd(y_actual, y_predicted): # Performance, Recall, Fbeta Score, Support -def calculate_prfs_using_rdd(y_actual, y_predicted): - # TODO: it is highly dependent on the labels - ## The actual and predicted interactions also need to be boolean of [interaction, no_interaction] for the sklearn precision_recall_fscore_support` - ## A better metric for recommender systems is precision at N - return +def calculate_prfs_using_rdd(y_actual, y_predicted, average='macro'): + """ + Determines the precision, recall, fscore, and support of the predictions. + With average of macro, the algorithm Calculate metrics for each label, and find their unweighted mean. 
+ See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html for details + + A better metric for recommender systems is precision at N (also in this package) + + Args: + y_actual: actual ratings in the format of an RDD of [ (userId, itemId, actualRating) ] + y_predicted: predicted ratings in the format of an RDD of [ (userId, itemId, predictedRating) ] + + Returns: + precision, recall, fbeta_score, and support values + + """ + + prediction_rating_pairs = y_predicted.map(lambda x: ((x[0], x[1]), x[2]))\ + .join(y_actual.map(lambda x: ((x[0], x[1]), x[2])))\ + .map(lambda ((user, item), (prediction, rating)): (user, item, prediction, rating)) + + true_vals = np.array(prediction_rating_pairs.map(lambda (user, item, prediction, rating): rating).collect()) + pred_vals = np.array(prediction_rating_pairs.map(lambda (user, item, prediction, rating): prediction).collect()) + + return precision_recall_fscore_support(map(lambda x: int(np.round(x)), true_vals),\ + map(lambda x: int(np.round(x)), pred_vals), average = average) def calculate_precision_at_n(y_actual, y_predicted, number_recommended = 100): """ @@ -240,16 +283,17 @@ def calc_user_ILS(item_list): -def calculate_catalog_coverage(y_actual, y_predicted): +def calculate_catalog_coverage(y_test, y_train, y_predicted): """ Calculates the percentage of user-item pairs that were predicted by the algorithm. - The test data is passed in to determine the total number of potential user-item pairs + The full data is passed in as y_test and y_train to determine the total number of potential user-item pairs Then the predicted data is passed in to determine how many user-item pairs were predicted. It is very important to NOT pass in the sorted and cut prediction RDD and that the algorithm trys to predict all pairs The use the function 'cartesian' as shown in line 25 of content_based.py is helpful in that regard Args: - y_training_data: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] + y_test: the data used to test the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] + y_train: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] y_predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. It is important that this is not the sorted and cut prediction RDD @@ -258,49 +302,55 @@ def calculate_catalog_coverage(y_actual, y_predicted): """ + y_full_data = y_test.union(y_train) + prediction_count = y_predicted.count() #obtain the number of potential users and items from the actual array as the algorithms cannot predict something that was not trained - num_users = y_actual.map(lambda row: row[0]).distinct().count() - num_items = y_actual.map(lambda row: row[1]).distinct().count() + num_users = y_full_data.map(lambda row: row[0]).distinct().count() + num_items = y_full_data.map(lambda row: row[1]).distinct().count() potential_predict = num_users*num_items catalog_coverage = prediction_count/float(potential_predict)*100 return catalog_coverage -def calculate_item_coverage(y_actual, y_predicted): +def calculate_item_coverage(y_test, y_train, y_predicted): """ Calculates the percentage of users pairs that were predicted by the algorithm. 
- The test data is passed in to determine the total number of potential items + The full dataset is passed in as y_test and y_train to determine the total number of potential items Then the predicted data is passed in to determine how many users pairs were predicted. It is very important to NOT pass in the sorted and cut prediction RDD Args: - y_actual: actual ratings in the format of an array of [ (userId, itemId, actualRating) ] + y_test: the data used to test the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] + y_train: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] y_predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. It is important that this is not the sorted and cut prediction RDD Returns: - user_coverage: value representing the percentage of user ratings that were able to be predicted + item_coverage: value representing the percentage of user ratings that were able to be predicted """ + y_full_data = y_test.union(y_train) + predicted_items = y_predicted.map(lambda row: row[1]).distinct().count() #obtain the number of potential users and items from the actual array as the algorithms cannot predict something that was not trained - num_items = y_actual.map(lambda row: row[1]).distinct().count() + num_items = y_full_data.map(lambda row: row[1]).distinct().count() item_coverage = predicted_items/float(num_items)*100 return item_coverage -def calculate_user_coverage(y_actual, y_predicted): +def calculate_user_coverage(y_test, y_train, y_predicted): """ Calculates the percentage of users that were predicted by the algorithm. - The test data is passed in to determine the total number of potential users + The full dataset is passed in as y_test and y_train to determine the total number of potential users Then the predicted data is passed in to determine how many users pairs were predicted. It is very important to NOT pass in the sorted and cut prediction RDD Args: - y_actual: actual ratings in the format of an array of [ (userId, itemId, actualRating) ] + y_test: the data used to test the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] + y_train: the data used to train the RecSys algorithm in the format of an RDD of [ (userId, itemId, actualRating) ] y_predicted: predicted ratings in the format of a RDD of [ (userId, itemId, predictedRating) ]. It is important that this is not the sorted and cut prediction RDD @@ -308,10 +358,11 @@ def calculate_user_coverage(y_actual, y_predicted): user_coverage: value representing the percentage of user ratings that were able to be predicted """ + y_full_data = y_test.union(y_train) predicted_users = y_predicted.map(lambda row: row[0]).distinct().count() #obtain the number of potential users and items from the actual array as the algorithms cannot predict something that was not trained - num_users = y_actual.map(lambda row: row[0]).distinct().count() + num_users = y_full_data.map(lambda row: row[0]).distinct().count() user_coverage = predicted_users/float(num_users)*100 @@ -343,7 +394,7 @@ def calculate_prediction_coverage(y_actual, y_predicted): return prediction_coverage -def calculate_serendipity(y_train, y_test, y_predicted, rel_filter=1): +def calculate_serendipity(y_train, y_test, y_predicted, sqlCtx, rel_filter=1): """ Calculates the serendipity of the recommendations. 
This measure of serendipity in particular is how surprising relevant recommendations are to a user @@ -438,7 +489,7 @@ def calculate_serendipity(y_train, y_test, y_predicted, rel_filter=1): return (average_overall_serendipity, average_serendipity) -def calculate_novelty(y_train, y_test, y_predicted): +def calculate_novelty(y_train, y_test, y_predicted, sqlCtx): """ Novelty measures how new or unknown recommendations are to a user An individual item's novelty can be calculated as the log of the popularity of the item @@ -504,7 +555,7 @@ def prob_by_rank(rank, n): prob = (n-rank)/float(n-1) return prob -def calc_content_serendipity(y_actual, y_predicted, content_array): +def calc_content_serendipity(y_actual, y_predicted, content_array, sqlCtx): """ Calculates the serendipity of the recommendations based on their content. This measure of serendipity in particular is how surprising relevant recommendations are to a user @@ -590,7 +641,7 @@ def calc_jaccard_diff(array_1, array_2): #otherwise a numpy float is returned which causes chaos and havoc to ensue return float(dist) -def calc_relevant_rank_stats(y_actual, y_predicted): +def calc_relevant_rank_stats(y_actual, y_predicted, sqlCtx): """ Determines the average minimum, average and maximum ranking of 'relevant' items 'Relevant' here means that the item was rated, i.e., it exists in the y_actual RDD @@ -633,4 +684,4 @@ def calc_relevant_rank_stats(y_actual, y_predicted): rank_stats = np.mean(max_ranks_local, axis=0) - return rank_stats \ No newline at end of file + return rank_stats diff --git a/src/algorithms/recommender_helpers.py b/hermes/algorithms/recommender_helpers.py similarity index 100% rename from src/algorithms/recommender_helpers.py rename to hermes/algorithms/recommender_helpers.py diff --git a/hermes/configs/list_of_files.ini b/hermes/configs/list_of_files.ini new file mode 100644 index 0000000..9c71011 --- /dev/null +++ b/hermes/configs/list_of_files.ini @@ -0,0 +1,17 @@ +[wiki] +# It looks like we will have to support globing, or else listing the Wikipedia +# data files is going to be a Herculean task +edit_history = /Users/tiffanyj/datasets/wikipedida/edits/*json.gz +full_text = /Users/tiffanyj/datasets/wikipedia/fulltext/*json.gz +[movielens] +# 20M data +movielens_20m_ratings=/data/ml/20m/ratings.json.gz +movielens_20m_tags=/data/ml/20m/tags.json.gz +movielens_20m_movies=/data/ml/20m/movies.json.gz +# 10M data +movielens_10m_ratings=/data/ml/10m/ratings.json.gz +movielens_10m_tags=/data/ml/10m/tags.json.gz +movielens_10m_movies=/data/ml/10m/movies.json.gz +# 1M data +movielens_1m_ratings=/data/ml/10m/ratings.json.gz +movielens_1m_movies=/data/ml/10m/movies.json.gz \ No newline at end of file diff --git a/hermes/configs/movielens_config.ini b/hermes/configs/movielens_config.ini new file mode 100644 index 0000000..4a76a87 --- /dev/null +++ b/hermes/configs/movielens_config.ini @@ -0,0 +1,18 @@ +[datasets] +dataname = movielens +# user vectors +user_vector_data = ["movielens_10m_ratings"] +user_vector_schemas = ["movielens_10m_ratings_schema"] +user_vector_transformations = ["ratings"] +# content vectors +#content_vector_data = ["movielens_10m_movies"] +#content_vector_schemas = ["movielens_10m_movies_schema"] +#content_vector_transformations = ["genre"] + +[recommenders] +user_recommenders = ["ALS"] +#content_recommenders = [""] + + +[metrics] +metrics = ["RMSE", "MAE"] \ No newline at end of file diff --git a/hermes/configs/my_list_of_files.ini b/hermes/configs/my_list_of_files.ini new file mode 100644 index 
0000000..74ff57f --- /dev/null +++ b/hermes/configs/my_list_of_files.ini @@ -0,0 +1,10 @@ +[movielens] +# 10M data +movielens_10m_ratings = /Users/tiffanyj/datasets/movielens/movielens_10m_ratings.json.gz +movielens_10m_tags = /Users/tiffanyj/datasets/movielens/movielens_10m_tags.json.gz +movielens_10m_movies = /Users/tiffanyj/datasets/movielens/movielens_10m_movies.json.gz + +# 10M schema +movielens_10m_ratings_schema = /Users/tiffanyj/datasets/movielens/movielens_20m_ratings_schema.json +movielens_10m_tags_schema = /Users/tiffanyj/datasets/movielens/movielens_20m_tags_schema.json +movielens_10m_movies_schema = /Users/tiffanyj/datasets/movielens/movielens_20m_movies_schema.json \ No newline at end of file diff --git a/hermes/configs/wiki_config.ini b/hermes/configs/wiki_config.ini new file mode 100644 index 0000000..56a5800 --- /dev/null +++ b/hermes/configs/wiki_config.ini @@ -0,0 +1,19 @@ +[meta] +# TODO: still has not handle meta +output_directory = /output/wikipedia_cbkmeans + +[datasets] +dataname = wiki +user_vector_data = ["edit_history"] +user_vector_transformations = ["num_edits_ceil"] +content_vector_data = ["full_text"] +content_vector_transformations = ["glove_model"] +glove_model = /data/glove/glove.txt + +[recommenders] +recommenders = ["CBWithKMeans"] +# TODO: still has not handle additional variables like top_n +top_n = 20 + +[metrics] +metrics = ["RMSE", "MAE"] \ No newline at end of file diff --git a/hermes/data_prep/movieLens_vectorize.py b/hermes/data_prep/movieLens_vectorize.py new file mode 100644 index 0000000..1d038bb --- /dev/null +++ b/hermes/data_prep/movieLens_vectorize.py @@ -0,0 +1,100 @@ +import numpy as np + +class movieLens_vectorize(): + + def __init__(self, user_interactions, content, user_vector_type, content_vector_type, **support_files ): + """ + Class initializer to load the required files + + Args: + user_interactions: The raw RDD of the user interactions. For MovieLens, these are the ratings + content: The raw RDD containing the item content. For MovieLens, this is the movie categories + user_vector_type: The type of user vector desired. For MovieLens you can choose between ['ratings', 'pos_ratings', 'ratings_to_interact', 'none']. + If 'none' is used then this means you will run your own custom mapping + content_vector_type: The type of content vector desired. For MovieLens you can choose between ['genre', 'none']. + If none is chosen no content vector will be returned and None may be passed into the content argument. + You do not need a content vector to run pure CF only but some performance metrics will not be able to be ran + support_files: If they exist, the supporting files, dataFrames, and/or file links necessary to run the content vectors. 
+ + + """ + self.user_vector_type = user_vector_type + self.content_vector_type = content_vector_type + + #Filter out uninteresting articles and users if they still exist in the dataset + self.user_interactions =user_interactions + self.user_interactions.registerTempTable("ratings") + self.content = content + self.content.registerTempTable("content") + + #if no support files were passed in, initialize an empty support file + if support_files: + self.support_files = support_files + else: + self.support_files = {} + + + def get_user_vector(self): + + if self.user_vector_type=='ratings': + user_info = self.user_interactions.map(lambda row: (row.user_id, row.movie_id, row.rating) ) + return user_info + + elif self.user_vector_type=='pos_ratings': + user_info = self.user_interactions.map(lambda row: (row.user_id, row.movie_id, row.rating) ).filter(lambda (u,m,r): r>3) + return user_info + + elif self.user_vector_type=='ratings_to_interact': + user_info = self.user_interactions.map(lambda row: (row.user_id, row.movie_id, rating_to_interaction(row.rating)) ) + return user_info + + elif self.user_vector_type=='none': + return None + + else: + print "Please choose a user_vector_type between 'ratings', 'pos_ratings', 'ratings_to_interact', and 'none'" + return None + + def get_content_vector(self): + + if self.content_vector_type=='genre': + content_array = self.content.map(lambda row: (row.movie_id, genre_vectorizer(row))) + return content_array + + elif self.content_vector_type=='none': + return None + + else: + print "Please choose a content_vector_type between 'genre' or 'none'" + return None + + + +def rating_to_interaction(rating): + if rating<3: + return -1 + else: + return 1 + + +def genre_vectorizer(row): + return np.array(( + int(row.genre_action), + int(row.genre_adventure), + int(row.genre_animation), + int(row.genre_childrens), + int(row.genre_comedy), + int(row.genre_crime), + int(row.genre_documentary), + int(row.genre_drama), + int(row.genre_fantasy), + int(row.genre_filmnoir), + int(row.genre_horror), + int(row.genre_musical), + int(row.genre_mystery), + int(row.genre_romance), + int(row.genre_scifi), + int(row.genre_thriller), + int(row.genre_war), + int(row.genre_western), + )) \ No newline at end of file diff --git a/hermes/data_prep/osm_vectoize.py b/hermes/data_prep/osm_vectoize.py new file mode 100644 index 0000000..9bea71c --- /dev/null +++ b/hermes/data_prep/osm_vectoize.py @@ -0,0 +1,213 @@ +import numpy as np + +class osm_vectorize(): + + def __init__(self, user_interactions, user_vector_type, content_vector_type, sqlCtx, **support_files ): + """ + Class initializer to load the required files + + Args: + user_interactions: The raw RDD of the user interactions. For OSM, these are the object edits as well as the object data + user_vector_type: The type of user vector desired. For MovieLens you can choose between ['ratings', 'pos_ratings', 'ratings_to_interact', 'none']. + If 'none' is used then this means you will run your own custom mapping + content_vector_type: The type of content vector desired. For MovieLens you can choose between ['tags_only', 'none']. + If none is chosen no content vector will be returned and None may be passed into the content argument. + You do not need a content vector to run pure CF only but some performance metrics will not be able to be ran + support_files: If they exist, the supporting files, dataFrames, and/or file links necessary to run the content vectors. 
+ + + """ + self.user_vector_type = user_vector_type + self.content_vector_type = content_vector_type + self.sqlCtx = sqlCtx + + #Filter out uninteresting items and users if they still exist in the dataset + self.user_interactions =user_interactions + self.user_interactions.registerTempTable("osm_data") + + filtered = self.sqlCtx.sql("select * from osm_data where id is not Null and uid is not Null") + filtered.registerTempTable("filtered_osm") + + #if no support files were passed in, initialize an empty support file + if support_files: + self.support_files = support_files + else: + self.support_files = {} + + + def get_user_vector(self): + + if self.user_vector_type=='ratings': + user_info = self.sqlCtx.sql("select uid, id, count(1) as rating from filtered_osm group by uid, id")\ + .map(lambda (user, item, interact):(int(user), int(item), interact)) + return user_info + + elif self.user_vector_type=='any_interact': + user_info = self.sqlCtx.sql("select uid, id, 1 as rating from filtered_osm group by uid, id")\ + .map(lambda (user, item, interact):(int(user), int(item), interact)) + return user_info + + elif self.user_vector_type=='num_edits_ceil': + user_info = self.sqlCtx.sql("select uid, id, count(1) as rating from filtered_osm group by uid, id") \ + .map(lambda (user, item, interact) : (user, int(item), min(interact, 5))) + return user_info + + elif self.user_vector_type=='none': + return None + + else: + print "Please choose a user_vector_type between 'ratings', 'any_interact', 'num_edits_ceil', and 'none'" + return None + + def get_content_vector(self): + + if self.content_vector_type=='tags_only': + content_array = self.content.map(lambda row: (row.id, osm_vectorize(row)))\ + .groupByKey().map(lambda (id, vectors): (id, np.array(list(vectors)).max(axis=0))) + return content_array + + elif self.content_vector_type=='none': + return None + + else: + print "Please choose a content_vector_type between 'tags_only' or 'none'" + return None + + + + +def osm_vectorize(row): + vect = [] + if row.source is not None: + vect.append(1) + else: + vect.append(0) + if row.building is not None: + vect.append(1) + else: + vect.append(0) + if row.highway is not None: + vect.append(1) + else: + vect.append(0) + if row.name is not None: + vect.append(1) + else: + vect.append(0) + if row.addr_city is not None: + vect.append(1) + else: + vect.append(0) + if row.addr_postcode is not None: + vect.append(1) + else: + vect.append(0) + if row.natural is not None: + vect.append(1) + else: + vect.append(0) + if row.landuse is not None: + vect.append(1) + else: + vect.append(0) + if row.surface is not None: + vect.append(1) + else: + vect.append(0) + if row.waterway is not None: + vect.append(1) + else: + vect.append(0) + if row.power is not None: + vect.append(1) + else: + vect.append(0) + if row.wall is not None: + vect.append(1) + else: + vect.append(0) + if row.oneway is not None: + vect.append(1) + else: + vect.append(0) + if row.amenity is not None: + vect.append(1) + else: + vect.append(0) + if row.ref is not None: + vect.append(1) + else: + vect.append(0) + if row.building_levels is not None: + vect.append(1) + else: + vect.append(0) + if row.maxspeed is not None: + vect.append(1) + else: + vect.append(0) + if row.barrier is not None: + vect.append(1) + else: + vect.append(0) + if row.type is not None: + vect.append(1) + else: + vect.append(0) + if row.place is not None: + vect.append(1) + else: + vect.append(0) + if row.foot is not None: + vect.append(1) + else: + vect.append(0) + if row.bicycle is not 
None: + vect.append(1) + else: + vect.append(0) + if row.railway is not None: + vect.append(1) + else: + vect.append(0) + if row.leisure is not None: + vect.append(1) + else: + vect.append(0) + if row.bridge is not None: + vect.append(1) + else: + vect.append(0) + if row.parking is not None: + vect.append(1) + else: + vect.append(0) + if row.man_made is not None: + vect.append(1) + else: + vect.append(0) + if row.railway is not None: + vect.append(1) + else: + vect.append(0) + if row.aeroway is not None: + vect.append(1) + else: + vect.append(0) + if row.wikipedia is not None: + vect.append(1) + else: + vect.append(0) + if row.osm_type =='Node': + vect.append(1) + else: + vect.append(0) + if row.osm_type =='Way': + vect.append(1) + else: + vect.append(0) + if row.osm_type =='Relation': + vect.append(1) + else: + vect.append(0) + return vect diff --git a/hermes/data_prep/wiki_vectorize.py b/hermes/data_prep/wiki_vectorize.py new file mode 100644 index 0000000..6510d53 --- /dev/null +++ b/hermes/data_prep/wiki_vectorize.py @@ -0,0 +1,161 @@ +from src.utils import article_to_category, glove, remove_templates, clean_categories, clean_links +import string +import numpy as np + +class wiki_vectorize(): + + def __init__(self, user_interactions, content, user_vector_type, content_vector_type, sqlCtx, **support_files): + """ + Class initializer to load the required files + + Args: + user_interactions: The raw RDD of the user interactions. For Wikipedia, this it is the full edit history. + We have been reading it in as wiki_edits = sqlCtx.read.json(wiki_edit_json_data_path, schema=schema) + content: The raw RDD containing the item content. For Wikipedia, this is the latest edit which contains full article content + user_vector_type: The type of user vector desired. For Wikipedia you can choose between ['num_edits', 'any_interact', 'num_edits_ceil', 'none']. + num_edits_ceil will count the number of edits but set an upper limit of 5 edits + If 'none' is used then this means you will run your own custom mapping + content_vector_type: The type of content vector desired. For Wikipedia you can choose between ['glove', 'category_map', 'none']. + If none is chosen no content vector will be returned and None may be passed into the content argument. + You do not need a content vector to run pure CF only but some performance metrics will not be able to be ran + support_files: If they exist, the supporting files, dataFrames, and/or file links necessary to run the content vectors. 
+ For example the category_map function at least needs the category_list from dbPedia + + """ + self.user_vector_type = user_vector_type + self.content_vector_type = content_vector_type + self.sqlCtx = sqlCtx + + #Filter out uninteresting articles and users if they still exist in the dataset + user_interactions.registerTempTable("ratings") + content.registerTempTable("content") + + filtered = self.sqlCtx.sql("select * from ratings where redirect_target is null and article_namespace=0 and user_id is not null") + filtered_content = self.sqlCtx.sql("select * from content where redirect_target is null and article_namespace=0 and full_text is not null") + + self.filtered = filtered + self.filtered.registerTempTable("wiki_ratings") + + self.filtered_content = filtered_content + self.filtered_content.registerTempTable("wiki_content") + + #if no support files were passed in, initialize an empty support file + if support_files: + self.support_files = support_files + else: + self.support_files = {} + + + def get_user_vector(self): + + if self.user_vector_type=='num_edits': + user_info = self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings \ + group by user_id, article_id") + + return user_info + + elif self.user_vector_type=='any_interact': + user_info = self.sqlCtx.sql("select user_id as user, article_id as item, 1 as rating from wiki_ratings \ + group by user_id, article_id") + + return user_info + + elif self.user_vector_type=='num_edits_ceil': + user_info = self.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki \ + group by user_id, article_id")\ + .map(lambda (user, article, rating): (user, article, min(rating, 5))) + + return user_info + + elif self.user_vector_type=='none': + return None + + else: + print "Please choose a user_vector_type between num_edits, any_interact, num_edits_ceil or none" + return None + + + def get_content_vector(self): + if self.content_vector_type=='glove': + + if self.support_files==1: + glove_model = self.support_files["glove_model"] + + article_mapping = self.filtered_content\ + .map(lambda row: (row.article_id, remove_templates(row.full_text)))\ + .map(lambda tup: (tup[0],clean_categories(tup[1])))\ + .map(lambda tup: (tup[0],clean_links(tup[1])))\ + .map( + lambda tup: + (tup[0], tup[1]\ + .replace('\n', ' ')\ + .replace("", '')\ + .replace("", '')\ + ) + )\ + .map(lambda tup: (tup[0], remove_punctuation(tup[1])))\ + .map(lambda tup: (tup[0], remove_urls(tup[1])))\ + .map(lambda tup: (tup[0], article_to_glove(tup[1], glove_model))) + + return article_mapping + + else: + print "Please pass in a glove_model. Like: support_files['glove_model']=Glove('glove.6B.50d.txt')" + elif self.content_vector_type=='category_map': + + if len(self.support_files)==3: + #The category map supporting dataFrames and objects are as followed: + #high_level_idx: An array of the high level categories to map to e.g. 
['Concepts', 'Life', 'Physical_universe', 'Society'] + #category_index_graph_link: Path to the csv of the category links as created from wiki_categories.create_linked_list() + #category_idx: Dictionary of the categories to an index as created from wiki_categories.create_category_idx_dicts() + + high_level_categories = self.support_files['high_level_categories'] + category_index_graph_link = self.support_files['category_index_graph_link'] + category_idx = self.support_file['category_idx'] + + ac = article_to_category(high_level_categories, category_index_graph_link, category_idx) + article_mapping = ac.run_mapping(self.filtered_content) + + return article_mapping + + else: + #print "To run category map you must at least have the category_list from dbPedia" + ##TODO work on the article_to_category function so that it can just pull in the category list from dpPedia + print "Please pass in the following files:" + print "high_level_idx: An array of the high level categories to map to e.g. ['Concepts', 'Life', 'Physical_universe', 'Society']" + print 'category_index_graph_link: Path to the csv of the category links as created from wiki_categories.create_linked_list()' + print 'category_idx: Dictionary of the categories to an index as created from wiki_categories.create_category_idx_dicts()' + print 'support_files = {"high_level_categories" : high_level_categories, \ + "category_index_graph_link" : category_index_graph_link, \ + "category_idx" : category_idx}' + return None + + elif self.content_vector_type=='none': + return None + + else: + print "Please choose between glove, category_map or none" + return None + +def remove_punctuation(text): + for char in string.punctuation: + text = text.replace(char, '') + return text + +def article_to_glove(text, model): + vec = np.zeros(model.vector_size) + for word in text.split(): + vec += model[word.lower()] + + return vec + +def remove_urls(text): + stext = text.split() + next_text = [] + for word in stext: + if word.startswith('http'): + continue + else: + next_text.append(word) + + return ' '.join(next_text) \ No newline at end of file diff --git a/src/examples/cf_example.py b/hermes/examples/cf_example.py similarity index 98% rename from src/examples/cf_example.py rename to hermes/examples/cf_example.py index d18d4fe..bb408a8 100644 --- a/src/examples/cf_example.py +++ b/hermes/examples/cf_example.py @@ -18,12 +18,12 @@ from sklearn.cross_validation import train_test_split from sklearn.cross_validation import StratifiedShuffleSplit -sys.path.append("../algorithms") +sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/" + "..")) -import performance_metrics as pm -import content_based as cb -from singleton import SCSingleton -from timer import Timer +import metrics.performance_metrics as pm +import metrics.content_based as cb +from modules.singleton import SCSingleton +from modules.timer import Timer """ This entire file is to provide a basic understanding of collaborative filtering diff --git a/hermes/hermes.py b/hermes/hermes.py new file mode 100644 index 0000000..cd8c40a --- /dev/null +++ b/hermes/hermes.py @@ -0,0 +1,198 @@ +"""Defined states in Hermes's state machine""" + +import json +import logging +import os + +import hermesui +import modules.helper as helper +import modules.metricgenerator as mg +import modules.recommendergenerator as rg +import modules.vectorgenerator as vg + +from modules.hermesglobals import Globals +from modules.timer import Timer + +# TODO: empty certain items in cargo after no longer needed? 
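The state functions defined below are wired together by `add_states()` in hermesctl.py and driven by the `StateMachine` class added in hermes/modules/statemachine.py. As a rough sketch of the contract every state follows — take the cargo, do its work, and hand back the next state — here is a toy two-state machine (the state names and the dict used as cargo are hypothetical, for illustration only):

```python
from modules.statemachine import StateMachine

def fetch_state(cargo):
    # a non-end state returns (next_state, cargo)
    cargo["payload"] = "some data"
    return report_state, cargo

def report_state(cargo):
    # an end state is simply called with the cargo; nothing needs to be returned
    print "finished with payload: %s" % cargo["payload"]

sm = StateMachine()
sm.add_state(fetch_state)
sm.add_state(report_state, isEndState=True)
sm.set_start(fetch_state)
sm.run(cargo={"payload": None})
```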
+# TODO: when to use error_state? do try-catch for all states? + +def __start(cargo): + """start_state without the state machine.""" + + if Globals.verbose: Globals.logger.debug("In start_state:") + + if Globals.verbose: Globals.logger.debug("Creating the hdfs directory " + cargo.hdfs_dir) + os.system("hdfs dfs -mkdir " + cargo.hdfs_dir) + + def load_json_files(datas): + for i in range(0, len(datas)): + json_path = datas[i].datapath + if Globals.verbose: Globals.logger.debug("Loading JSON file " + json_path + " into hdfs directory " + cargo.hdfs_dir) + os.system("hdfs dfs -put " + json_path + " " + cargo.hdfs_dir + "/" + os.path.basename(json_path)) + + load_json_files(cargo.datas) + +def start_state(cargo): + """Start of the state machine. Create HDFS directory and upload the input data. + Returns: json_to_rdd_state as next state + """ + + __start(cargo) + + newState = json_to_rdd_state + if Globals.verbose: Globals.logger.debug("start_state -> json_to_rdd_state") + + return newState, cargo + +# TODO: make json_to_rdd_state, split_data_state, and make_prediction_state into one state? +def __json_to_rdd(cargo): + """json_to_rdd_state without the state macine.""" + + if Globals.verbose: Globals.logger.debug("In json_to_rdd_state:") + + # create RDD for each JSON file and store it in Cargo's vectors list + for i in range(0, len(cargo.datas)): + data = cargo.datas[i] + if Globals.verbose: Globals.logger.debug("Working with json file %s" % data.datapath) + + if Globals.verbose: Globals.logger.debug("Creating dataframe based on the content of the json file") + datapath_in_hdfs = "hdfs://" + cargo.fs_default_ip_addr + "/" + cargo.hdfs_dir + "/" + os.path.basename(data.datapath) + data.set_dataframe(Globals.scsingleton.sc, Globals.scsingleton.sqlCtx, datapath_in_hdfs) + + if Globals.verbose: Globals.logger.debug("Creating RDD based on the computed dataframe and configuration provided by the user") + cargo.vectors.append( vg.VectorFactory().create_obj_vector(data, cargo.support_files) ) + + # TODO: clean cargo? + # cargo.datas = [] + # cargo.hdfs_dir = None + # cargo.fs_default_ip_addr = None + +def json_to_rdd_state(cargo): + """Parse JSON to RDD. + Returns: split_data_state as next state + """ + + __json_to_rdd(cargo) + + newState = split_data_state + if Globals.verbose: Globals.logger.debug("json_to_rdd_state -> split_data_state") + + return newState, cargo + +def __split_data(cargo): + """split_data_state without the state machine.""" + + if Globals.verbose: Globals.logger.debug("In split_data_state:") + + for i in range(0, len(cargo.vectors)): + vector = cargo.vectors[i] + weights, seed = hermesui._ask_user_for_split_percentage(vector.data.datapath) + vector.split_data(weights, seed) + +def split_data_state(cargo): + """Split data to train, test, and (optional) validate. 
+ Returns: make_prediction_state as next state + """ + + __split_data(cargo) + + newState = make_prediction_state + if Globals.verbose: Globals.logger.debug("split_data_state -> make_prediction_state") + + return newState, cargo + +def __make_prediction(cargo): + """make_prediction_state without the state machine.""" + + if Globals.verbose: Globals.logger.debug("In make_prediction_state:") + + for i in range(0, len(cargo.vectors)): + thisvector = cargo.vectors[i] + + # select which recommenders based on the vector type + recommenders = None + thisvector_uservector = None + thisvector_contentvector = None + if helper.is_direct_subclass(thisvector, vg.UserVector): + if Globals.verbose: Globals.logger.debug("Iterating through recommenders for user vector on data %s", thisvector.data.datapath) + thisvector_uservector = thisvector + recommenders = cargo.user_recommenders + elif helper.is_direct_subclass(thisvector, vg.ContentVector): + if Globals.verbose: Globals.logger.debug("Iterating through recommenders for content vector on data %s", thisvector.data.datapath) + thisvector_contentvector = thisvector + thisvector_uservector = thisvector.uservector + recommenders = cargo.content_recommenders + + # run all recommenders on the vector + for r in recommenders: + if Globals.verbose: Globals.logger.debug("Making recommendation %s on data %s", r, thisvector.data.datapath) + # TODO: implement other use case, ie. WithTfidf(), etc. + recommender = rg.RecommenderFactory().create_obj_recommender(r, thisvector_uservector, thisvector_contentvector) + # default use case + # recommender = RecommenderFactory().create_obj_recommender(r, vector, Default()) + # with tf-idf use case + # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithTfidf()) + # without tf-idf use case + # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithoutTfidf()) + # etc. + with Timer() as t: + prediction_vector = recommender.make_prediction() + if Globals.verbose: Globals.logger.debug("Making prediction takes %s seconds" % t.secs) + +def make_prediction_state(cargo): + """Develop model based on the train data and make prediction based on this model. + Returns: calculate_metrics_state as next state + """ + + __make_prediction(cargo) + + newState = calculate_metrics_state + if Globals.verbose: Globals.logger.debug("make_prediction_state -> calculate_metrics_state") + + return newState, cargo + +def __calculate_metrics(cargo): + """calculate_metrics_state without the state machine.""" + + if Globals.verbose: Globals.logger.debug("In calculate_metrics_state:") + + # create a metric executor + executor = mg.MetricExecutor(mg.Metric()) + + for i in range(0, len(cargo.vectors)): + Globals.logger.info("-" * 80) + Globals.logger.info("Data: %s" % cargo.vectors[i].data.datapath) + for m in cargo.metrics: + Globals.logger.info("Metric: %s" % (m)) + # check if metric exists + metric = mg.MetricFactory().create_obj_metric(m) + Globals.logger.info(metric) + # set metric in executor + executor.change_metric(metric) + # execute the metric + with Timer() as t: + Globals.logger.info("Metric: %s = %f" % (m, executor.execute(cargo.vectors[i]))) + if Globals.verbose: Globals.logger.debug("Calculating metric takes %s seconds" % t.secs) + Globals.logger.info("-" * 80) + +def calculate_metrics_state(cargo): + """Test the metrics specified by the user. This is an end state. 
+ Returns: None because this is the last state + """ + + __calculate_metrics(cargo) + + if Globals.verbose: Globals.logger.debug("calculate_metrics_state -> end_state") + + return + +def error_state(cargo): + """Error state. Print out the error messages. This is an end state. + Returns: None because this is the last state + """ + + if Globals.verbose: Globals.logger.debug("In error_state:") + Globals.logger.error("ERROR: " + cargo.error_msg) + if Globals.verbose: Globals.logger.debug("error_state -> end_state") + return + diff --git a/hermes/hermesctl.py b/hermes/hermesctl.py new file mode 100644 index 0000000..b4e1127 --- /dev/null +++ b/hermes/hermesctl.py @@ -0,0 +1,381 @@ +"""Hermes's entry point""" + +import click +import ConfigParser +import itertools +import json +import logging +import sys +from pyspark import SparkConf + +import hermes +import modules.config as config + +from modules.cargo import Cargo +from modules.data import UserVectorData, ContentVectorData +from modules.hermesglobals import Globals +from modules.singleton import SCSingleton +from modules.statemachine import StateMachine + + +def add_states(state_machine): + """ Add states to the given state machine. + + The current implemented state machine follows this path: + json_to_rdd -> split_data -> make_prediction -> calculate_metrics + + Args: + state_machine: state machine + """ + state_machine.add_state(hermes.start_state) + state_machine.add_state(hermes.json_to_rdd_state) + state_machine.add_state(hermes.split_data_state) + state_machine.add_state(hermes.make_prediction_state) + state_machine.add_state(hermes.calculate_metrics_state, isEndState=True) + state_machine.add_state(hermes.error_state, isEndState=True) + state_machine.set_start(hermes.start_state) + return + +def create_logger(name): + """ Create logger with the given name if it's not already created. + + Args: + name: name of logger + Returns: + logger + """ + logger = logging.getLogger(name) + + # check if logger is already created; if not, create it + if not logger.handlers: + logger.setLevel(logging.DEBUG) + # create hermes.log file that prints out debug messages + fh = logging.FileHandler("hermes.log") + fh.setLevel(logging.DEBUG) + # create console handler for stderr that prints out error messages + che = logging.StreamHandler() + che.setLevel(logging.ERROR) + # create console handler for stdout for info, debug, and error level + chod = logging.StreamHandler(sys.stdout) + chod.setLevel(logging.DEBUG) + choe = logging.StreamHandler(sys.stdout) + choe.setLevel(logging.ERROR) + # create formatter and add it to the handlers + formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + fh.setFormatter(formatter) + che.setFormatter(formatter) + chod.setFormatter(formatter) + choe.setFormatter(formatter) + # add handlers to logger + logger.addHandler(fh) + logger.addHandler(che) + logger.addHandler(chod) + logger.addHandler(choe) + + return logger + +def create_sparkcontext(): + """ Create a single Spark Context with the app name hermes. + + Returns: + SCSingleton: wrapper object that prevents multiple instantiation of the spark context + + """ + conf = SparkConf().setAppName("hermes") + return SCSingleton(conf) + + +# TODO: is there a better way to implement this function? +def extract_configs(configs_path, list_of_files_config_path, cargo): + """ Extract configuration files and store the configurations in cargo. 
+ + Args: + configs_path: list of paths to configuration files + list_of_files_config_path: path to list of files configuration file + cargo: object passed in state machine + + """ + + # extract list_of_files_config + lofcp = ConfigParser.ConfigParser() + lofcp.read(list_of_files_config_path) + + def handle_recognized_section_item(section, item_key, item_value): + """ Helper function that extracts recognized section items. """ + if section == "datasets": + datasets_items[item_key] = item_value + # [datasets] items will be placed into cargo in handle_dataset_section() + return + if section == "recommenders": + if item_key == "user_recommenders": + # add list of recommenders for user vectors into cargo + cargo.user_recommenders.extend( json.loads(item_value) ) + elif item_key == "content_recommenders": + # add list of recommenders for content vectors into cargo + cargo.content_recommenders.extend( json.loads(item_value) ) + return + if section == "metrics": + if item_key == "metrics": + # add list of metrics into cargo + cargo.metrics.extend( json.loads(item_value) ) + return + + def handle_unrecognized_section_item(section, item_key, item_value): + """ Helper function that extracts unrecognized section items. """ + if section == "datasets": + # any unrecognized [datasets] items will be placed in cargo's support_files dictionary + cargo.support_files[item_key] = item_value + return + if section == "recommenders": + Globals.logger.error("ERROR: skip unrecognized item " + item_key + " under section [" + section + "] in config" + config_path) + return + if section == "metrics": + Globals.logger.error("ERROR: skip unrecognized item " + item_key + " under section [" + section + "] in config" + config_path) + return + + def handle_dataset_section(dataset_items, config_path): + """ Helper function that handles [datasets] section. """ + # TODO: which is better? iterating through sections then items or iterating through just items of list_of_files_config? + + # make sure dataname is initialized in order to verify the section in list_of_files_config + if not ("dataname" in datasets_items.keys()): + Globals.logger.error("ERROR: config " + config_path + " must have dataname specified.") + sys.exit() + + dataname = datasets_items["dataname"] + lofmap = config.map_section(lofcp, dataname) + + hasUserVector = False + # check it has the required items to build a UserVectorData + if set(config.REQ_UV_HEADINGS) < set(datasets_items.keys()): + hasUserVector = True + + hasContentVector = False + # check it has the required items to build a ContentVectorData + if set(config.REQ_CV_HEADINGS) < set(datasets_items.keys()): + hasContentVector = True + + if not hasContentVector and not hasUserVector: + Globals.logger.error("ERROR: config " + config_path + " does not have declaration for a user vector or a content vector") + sys.exit() + + if hasContentVector and not hasUserVector: + Globals.logger.error("ERROR: config " + config_path + " does not have declaration for a user vector when a content vector is declared") + sys.exit() + + if hasContentVector and hasUserVector: + # create content vector data + create_datas(lofmap, dataname, datasets_items, config_path, isUserVector=False) + else: + # create user vector data + create_datas(lofmap, dataname, datasets_items, config_path, isUserVector=True) + + def create_datas(lofmap, dataname, datasets_items, config_path, isUserVector): + """ Helper function that creates a UserVectorData or ContentVectorData depending if it isUserVector or not. 
+ + Storing configuration for UserVector or ContentVector in an object (like UserVectorData and ContentVectorData) + is easier than storing its individual parts. UserVectorData and ContentVectorData will be added into cargo in + cargo's data list. + """ + + # TODO: rewrite this, quick fix for now + if isUserVector: + datapaths_heading = "user_vector_data" + vector_transformations_heading = "user_vector_transformations" + schemapaths_heading = "user_vector_schemas" + + datapaths = json.loads(datasets_items[datapaths_heading]) + vector_transformations = json.loads(datasets_items[vector_transformations_heading]) + hasSchemas = False + if schemapaths_heading in datasets_items.keys(): + schemapaths = json.loads(datasets_items[schemapaths_heading]) + hasSchemas = True + + # check that a vector transformation is specified for each data + # TODO: multiple vector trasnformation for each data in the future? + if len(datapaths) != len(vector_transformations): + Globals.logger.error("ERROR: must specify a vector type for each data in config " + config_path) + sys.exit() + + for i in range(0, len(datapaths)): + # set datapath + try: + datapath = lofmap[datapaths[i]] + except KeyError: + Globals.logger.error("ERROR: cannot find data " + datapath + " in the list_of_files_config for config " + config_path) + sys.exit() + # set vector_transformation + vector_transformation = vector_transformations[i] + # set schemapath + try: + if hasSchemas: schemapath = lofmap[schemapaths[i]] + except IndexError, KeyError: + schemapath = None + + uservectordata = UserVectorData(datapath, vector_transformation, schemapath, dataname) + cargo.datas.append(uservectordata) + + else: + # user vector + uv_datapaths_heading = "user_vector_data" + uv_vector_transformations_heading = "user_vector_transformations" + uv_schemapaths_heading = "user_vector_schemas" + + uv_datapaths = json.loads(datasets_items[uv_datapaths_heading]) + uv_vector_transformations = json.loads(datasets_items[uv_vector_transformations_heading]) + uv_hasSchemas = False + if uv_schemapaths_heading in datasets_items.keys(): + uv_schemapaths = json.loads(datasets_items[uv_schemapaths_heading]) + uv_hasSchemas = True + + # content vector + cv_datapaths_heading = "content_vector_data" + cv_vector_transformations_heading = "content_vector_transformations" + cv_schemapaths_heading = "content_vector_schemas" + + cv_datapaths = json.loads(datasets_items[cv_datapaths_heading]) + cv_vector_transformations = json.loads(datasets_items[cv_vector_transformations_heading]) + cv_hasSchemas = False + if cv_schemapaths_heading in datasets_items.keys(): + cv_schemapaths = json.loads(datasets_items[cv_schemapaths_heading]) + cv_hasSchemas = True + + # check that a vector transformation is specified for each data + # TODO: multiple vector trasnformation for each data in the future? 
+ if len(cv_datapaths) != len(cv_vector_transformations) or len(uv_datapaths) != len(uv_vector_transformations): + Globals.logger.error("ERROR: must specify a vector type for each data in config " + config_path) + sys.exit() + + if len(cv_datapaths) != len(uv_datapaths): + Globals.logger.error("ERROR: content vector must have a corresponding user vector") + sys.exit() + + for i in range(0, len(cv_datapaths)): + # set datapath + try: + cv_datapath = lofmap[cv_datapaths[i]] + except KeyError: + Globals.logger.error("ERROR: cannot find data " + cv_datapath + " in the list_of_files_config for config " + config_path) + sys.exit() + try: + uv_datapath = lofmap[uv_datapaths[i]] + except KeyError: + Globals.logger.error("ERROR: cannot find data " + uv_datapath + " in the list_of_files_config for config " + config_path) + sys.exit() + # set vector_transformation + cv_vector_transformation = cv_vector_transformations[i] + uv_vector_transformation = uv_vector_transformations[i] + # set schemapath + try: + if cv_hasSchemas: cv_schemapath = lofmap[cv_schemapaths[i]] + except IndexError, KeyError: + cv_schemapath = None + try: + if uv_hasSchemas: uv_schemapath = lofmap[uv_schemapaths[i]] + except IndexError, KeyError: + uv_schemapath = None + + uservectordata = UserVectorData(uv_datapath, uv_vector_transformation, uv_schemapath, dataname) + contentvectordata = ContentVectorData(cv_datapath, cv_vector_transformation, cv_schemapath, dataname, uservectordata) + cargo.datas.append(contentvectordata) + + # extract configs + for config_path in configs_path: + cp = ConfigParser.ConfigParser() + cp.read(config_path) + datasets_items = {} + # extract sections + for section in cp.sections(): + if section in config.HEADINGS.keys(): + # extract section's items + for (item_key, item_value) in cp.items(section): + if item_key in config.HEADINGS.get(section): + handle_recognized_section_item(section, item_key, item_value) + else: + handle_unrecognized_section_item(section, item_key, item_value) + # end extract items + else: + Globals.logger.error("ERROR: skip unrecognized section heading [" + section + "] in config " + config_path) + # handle [datasets] section + if section == "datasets": + handle_dataset_section(datasets_items, config_path) + # end extract sections + # end extract configs + +def print_version(ctx, param, value): + """Print the current version of hermes and exit.""" + if not value: + return + import pkg_resources + version = None + try: + version = pkg_resources.get_distribution("hermes").version + finally: + del pkg_resources + click.echo(version) + ctx.exit() + +def print_data(ctx, param, value): + """Print a list of data currently supported and exit.""" + if not value: + return + click.echo("This option is not yet implemented.") + ctx.exit() + +def print_recommenders(ctx, param, value): + """Print a list of recommender system algorithms currently supported and exit.""" + if not value: + return + click.echo("This option is not yet implemented.") + ctx.exit() + +def print_metrics(ctx, param, value): + """Print a list of metrics currently supported and exit.""" + if not value: + return + click.echo("This option is not yet implemented.") + ctx.exit() + +# TODO: implement print_data, print_recommenders, print_metrics? 
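For orientation, extract_configs() resolves every logical dataset name through the list_of_files config: config.map_section() flattens the section named after `dataname` into a plain dict, which is then indexed with the names listed under user_vector_data / content_vector_data. A hypothetical example of the resulting mapping (the paths are placeholders, not real files):

```python
# What map_section() hands back for a [movielens] section of the list_of_files
# config might look like this (illustrative paths only):
lofmap = {
    "movielens_10m_ratings":        "/data/movielens/10m_ratings.json",
    "movielens_10m_ratings_schema": "/data/movielens/10m_ratings_schema.json",
}
# so that lofmap[datapaths[i]] turns a name from user_vector_data into a real path.
```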
+@click.command() +@click.option("--version", callback=print_version, is_flag=True, expose_value=False, is_eager=True, \ + help="Display hermes's version number.") +@click.option("--data", callback=print_data, is_flag=True, expose_value=False, is_eager=True, \ + help="Print a list of data currently supported.") +@click.option("--algos", callback=print_recommenders, is_flag=True, expose_value=False, is_eager=True, \ + help="Print a list of recommender system algorithms currently supported.") +@click.option("--metrics", callback=print_metrics, is_flag=True, expose_value=False, is_eager=True, \ + help="Print a list of metrics currently supported.") +@click.option("--verbose", is_flag=True, \ + help="Print debug messages") +@click.option("--hdfs_dir", default="datasets", \ + help="Name of HDFS directory to store input data. Default = datasets.") +# IP address of fs.default.name used in HDFS +@click.argument("fs_default_ip_addr", default="localhost:9000") +@click.argument("list_of_files_config", type=click.Path(exists=True), nargs=1) +@click.argument("configs", type=click.Path(exists=True), nargs=-1) +def main(verbose, hdfs_dir, fs_default_ip_addr, list_of_files_config, configs): + + # initialize global variables + Globals.verbose = verbose + Globals.logger = create_logger("hermes") + Globals.scsingleton = create_sparkcontext() + + # create state machine + state_machine = StateMachine() + add_states(state_machine) + + # create cargo + cargo = Cargo() + + # add items to cargo + cargo.hdfs_dir = hdfs_dir + cargo.fs_default_ip_addr = fs_default_ip_addr + # extract configs and add them to cargo + extract_configs(configs, list_of_files_config, cargo) + + # run state machine + state_machine.run(cargo) + + + diff --git a/hermes/hermesui.py b/hermes/hermesui.py new file mode 100644 index 0000000..3301330 --- /dev/null +++ b/hermes/hermesui.py @@ -0,0 +1,137 @@ +"""Hermes's user interface via the command line""" + +def _ask_user_for_rdd_format(schema_path, schema_names): + """Ask user for the desired RDD format. + Args: + schema_path: the path to the schema file + schema_names: + Returns: List of schema_name's id. + """ + print "How do you want your data to be parsed?" + print "For example: Given the following options" + print "(0) movie_id" + print "(1) rating" + print "(2) timestamp" + print "(3) user_id" + print "if you wanted the data to be parsed in the format of [(user_id, movie_id, rating)]," + print "please type in: 3 0 1\n" + + def _check_schema_ids(schema_ids, num_schema_ids): + + # check if each schema_name_id is in the range of num_schema_ids + for schema_name_id in schema_name_ids: + if schema_name_id not in range(0, num_schema_ids): + print "Option provided is not in range." + return False + + # check that there are no duplicates + if len(schema_name_ids) != len(set(schema_name_ids)): + print "There are duplicates. Please provide no duplicates." + return False + + return True + + + print "For the following given schema %s" % (schema_path) + print "how do you want your data to be parsed? " + for i in range(0, len(schema_names)): + print "(%s) %s" % (i, schema_names[i]) + + while True: + user_input = raw_input("Enter the numbers separated by blank space: ") + try: + schema_name_ids = [int(schema_name_id.strip()) for schema_name_id in user_input.split(" ")] + if _check_schema_ids(schema_name_ids, len(schema_names)): + break + except ValueError: + print "Please provide a valid number." 
+ + return schema_name_ids + +def _ask_user_for_split_percentage(datum_json_path): + """Ask user what percentage to split the data into training, test, and validation. + Args: + datum_json_path: the path to the data JSON file + Returns: Tuple of percentage of training, test, and validation respectively in float notation. + (trainingPercentage, testPercentage, validationPercentage), seed + """ + print "How do you want to split your data?" + print "For example: If you wanted to split the data into " + print "60% training, 40% test, 0% validation, seed = 11, please type in:" + print "Percentage for training: 60" + print "Percentage for test: 40" + print "Percentage for validation: 0" + print "Seed: 11\n" + + + def _check_percentage(percentage): + """Check if the percentage is valid. + """ + if percentage in range(0, 100): + return True + else: + return False + + def _check_sum_percentage(a, b, c): + """Check if the sum of the given percentages is equal to 100. + """ + sum_percentage = a + b + c + if sum_percentage == 100: + return True + else: + return False + + print "For the following given data %s" % (datum_json_path) + print "how do you want to split your data?" + while True: + while True: + try: + trainingPercentage = int(raw_input("Percentage for training: ").strip()) + except ValueError: + print "Please provide a valid number." + else: + if _check_percentage(trainingPercentage): + break + else: + print "Please provide a number from 0 - 100." + while True: + try: + testPercentage = int(raw_input("Percentage for test: ").strip()) + except ValueError: + print "Please provide a valid number." + else: + if _check_percentage(testPercentage): + break + else: + print "Please provide a number from 0 - 100." + while True: + try: + validationPercentage = int(raw_input("Percentage for validation: ").strip()) + except ValueError: + print "Please provide a valid number." + else: + if _check_percentage(validationPercentage): + break + else: + print "Please provide a number from 0 - 100." + if _check_sum_percentage(trainingPercentage, testPercentage, validationPercentage): + break + else: + print "Sum of percentages does not equal to 100. Please re-input the percentages." + + while True: + try: + seed = int(raw_input("Seed: ").strip()) + break + except ValueError: + print "Please provide a valid number." + + # convert it to a percentage from 0 - 1 + trainingPercentage = trainingPercentage/100. + testPercentage = testPercentage/100. + validationPercentage = validationPercentage/100. + + return [trainingPercentage, testPercentage, validationPercentage], seed + + + diff --git a/hermes/modules/__init__.py b/hermes/modules/__init__.py new file mode 100644 index 0000000..dee4efd --- /dev/null +++ b/hermes/modules/__init__.py @@ -0,0 +1,18 @@ +import cargo +import config +import data +import helper +import hermesglobals +import metricgenerator +import recommendergenerator +import singleton +import statemachine +import timer +import vectorgenerator +import vg.movielens_vectorgenerator +import vg.wiki_vectorgenerator +import rg.default_usecase +import rg.interface +import rg.with_tfidf_usecase +import rg.without_tfidf_usecase +import mg \ No newline at end of file diff --git a/hermes/modules/cargo.py b/hermes/modules/cargo.py new file mode 100644 index 0000000..adc6330 --- /dev/null +++ b/hermes/modules/cargo.py @@ -0,0 +1,30 @@ +class Cargo(object): + """Cargo is the object passed around in the state machine. + It encapsulates all the parameters needed for each state in one object. 
+ + * hdfs_dir: Name of HDFS directory to store input data. + One of the option passed in when running hermes binary. + Default = datasets. + * fs_default_ip_addr: IP address of fs.default.name used in HDFS. + One of the arguments passed in when running hermes binary. + Default = localhost:9000. + * datas: List of Data objects initialized when extracting the configuration file. + * vectors: List of Vector objects initialized during one of the states in the state machine, json_to_rdd_state. + * support_files: Unrecognized items in [datasets] section of the configuration file that is presumed to be support files for the creation of a Vector. + * recommenders: List of recommender system algorithms initialized when extracting the configuration file. + * metrics: List of metrics initialized when extracting the configuration file. + * error_msg: It starts out as an empty string that will be initialized as an error message to the error state. + """ + # TODO: implement cargo as object pool model? + def __init__(self): + self.hdfs_dir = None + self.fs_default_ip_addr = None + self.datas = [] # used until json_to_rdd_state + self.vectors = [] # used until develop_model_state + self.support_files = {} + # TODO: clean up so that there is only recommenders...and not user_recommenders & content_recommenders + self.user_recommenders = [] + self.content_recommenders = [] + self.metrics = [] + self.error_msg = "" + diff --git a/hermes/modules/config.py b/hermes/modules/config.py new file mode 100644 index 0000000..eeaf6c5 --- /dev/null +++ b/hermes/modules/config.py @@ -0,0 +1,46 @@ +# recognized sections and their items +# 1. datasets section +REQ_UV_HEADINGS = ("user_vector_data", "user_vector_transformations") +OPT_UV_HEADINGS = ("user_vector_schemas",) +UV_HEADINGS = () + REQ_UV_HEADINGS + OPT_UV_HEADINGS + +REQ_CV_HEADINGS = ("content_vector_data", "content_vector_transformations") +OPT_CV_HEADINGS = ("content_vector_schemas",) +CV_HEADINGS = () + REQ_CV_HEADINGS + OPT_CV_HEADINGS + +DATASETS_HEADINGS = ("dataname",) + UV_HEADINGS + CV_HEADINGS + +# 2. recommenders section +RECOMMENDERS_HEADINGS = ("user_recommenders", "content_recommenders") + +# 3. metrics section +METRICS_HEADINGS = ("metrics") + +HEADINGS = { "datasets": DATASETS_HEADINGS, \ + "recommenders": RECOMMENDERS_HEADINGS, \ + "metrics": METRICS_HEADINGS \ + } + +def map_section(config_parser, section): + """ Map a section with the given section name and return a dictionary of the section. + + Args: + config_parser: config parser of the configuration file + section: section name to map + + Returns: + section_dict: a dictionary of the section. + Use section_dict to obtain the value of the item provided that you know the item name, ie. section_dict[item_name]. + """ + + section_dict = {} + options = config_parser.options(section) + for option in options: + try: + section_dict[option] = config_parser.get(section, option) + if section_dict[option] == -1: + Globals.logger.debug(__name__ + ": map_section(): skipping option " + option) + except: + Globals.logger.error(__name__ + ": map_section(): exception on option " + option) + section_dict[option] = None + return section_dict diff --git a/hermes/modules/data.py b/hermes/modules/data.py new file mode 100644 index 0000000..092e452 --- /dev/null +++ b/hermes/modules/data.py @@ -0,0 +1,43 @@ +import helper +from hermesglobals import Globals + +# TODO: a better way of storing configuration from configuration file? +class Data(object): + """ Store configuration from configuration files. 
""" + + def __init__(self, datapath, vector_transformation, schemapath, dataname): + #if not helper.is_filepath_valid(datapath): + # raise OSError + self.datapath = datapath + self.dataname = dataname + self.vector_transformation = vector_transformation + self.schema = helper.get_schema(schemapath) + self.dataframe = None + # TODO: do we need to know from which config the data is from? + + def set_dataframe(self, sc, sqlCtx, datapath_in_hdfs): + self.dataframe = sqlCtx.read.json(datapath_in_hdfs, self.schema) + # explicitly repartition RDD after loading so that more tasks can run on it in parallel + # by default, defaultMinPartitions == defaultParallelism == estimated # of cores across all of the machines in your cluster + # TODO: a better way to go about the dataframe repartition? + self.dataframe = self.dataframe.repartition(sc.defaultParallelism * 3) + + # set schema if it is not already set + if self.schema is None: + self.schema = self.dataframe.schema + +class UserVectorData(Data): + def __init__(self, datapath, vector_transformation, schemapath, dataname): + super(self.__class__, self).__init__(datapath, vector_transformation, schemapath, dataname) + self.which_vector = Globals.constants.USERVECTOR + +class ContentVectorData(Data): + def __init__(self, datapath, vector_transformation, schemapath, dataname, uservectordata): + super(self.__class__, self).__init__(datapath, vector_transformation, schemapath, dataname) + self.which_vector = Globals.constants.CONTENTVECTOR + self.uservectordata = uservectordata + + + + + diff --git a/hermes/modules/helper.py b/hermes/modules/helper.py new file mode 100644 index 0000000..bf0f76f --- /dev/null +++ b/hermes/modules/helper.py @@ -0,0 +1,124 @@ +"""Global helper functions""" + +import imp +import importlib +import inspect +import json +import md5 +import os +import traceback +import zipfile +import zipimport + +from pyspark.sql.types import StructType + +from hermesglobals import Globals + +def is_filepath_valid(filepath): + return True if os.path.isfile(filepath) else False + +def get_schema(schema_path): + if not schema_path: + return None + with open(schema_path, "r") as schema_file: + return StructType.fromJson(json.load(schema_file)) + +def load_modules_in_zip(zipfile_path, which_dir): + try: + try: + zh = zipfile.ZipFile(zipfile_path) + zi = zipimport.zipimporter(zipfile_path) + for name in zh.namelist(): + if os.path.basename(os.path.dirname(name)) == which_dir: + module = zi.load_module(os.path.splitext(name)[0]) + yield module + finally: + try: zh.close() + except: pass + except Exception as err: + Globals.logger.error(err, exc_info=True) + raise + + +def load_modules_in_dir(dir_path): + try: + try: + for root, dirs, files in os.walk(dir_path): + for filename in files: + if filename.endswith(".py"): + # current_file == module + thisfilepath = os.path.join(root, filename) + thisfile = open(thisfilepath, "rb") + # use md5.new to generate unique module identifier + # in case there are two modules of the same name + # assumption: no subdirectory within dir_path + module = imp.load_source(md5.new(thisfilepath).hexdigest(), thisfilepath, thisfile) + yield module + thisfile.close() + finally: + try: thisfile.close() + except: pass + except ImportError as err: + Globals.logger.error(err, exc_info=True) + raise + except Exception as err: + Globals.logger.error(err, exc_info=True) + raise + +# check whether checkcls is the cls or direct subclass of cls +def is_direct_subclass(obj, cls): + # 1. 
make sure that checkcls is a class object + checkcls = obj + if not inspect.isclass(obj): + checkcls = obj.__class__ + # 2. check if checkcls == cls; if it is, return True + # 3. check if cls is a direct parent of checkcls + return type(checkcls) == type(cls) or cls in checkcls.__bases__ + +# check whether checkcls it the cls or non-direct subclass of cls +def is_non_direct_subclass(checkcls, cls): + # 1. make sure that checkcls is a class object + checkcls = obj + if not inspect.isclass(obj): + checkcls = obj.__class__ + # 2. check if checkcls == cls; if it is, return True + # 3. check if checkcls ia subclass of cls + return type(checkcls) == type(cls) or issubclass(checkcls, cls) + + +# return generator of direct descendants +def get_direct_subclasses(module, cls): + try: + for name, obj in inspect.getmembers(module): + # 1. check that obj is a class + if inspect.isclass(obj): + # 2. check that obj is a direct descendant of class + if cls in obj.__bases__: + yield obj + else: + # WARNING: assumption that there is only one class of the same name in all of the modules + for objparent in obj.__bases__: + if objparent.__name__ == cls.__name__: + yield obj + except Exception as err: + Globals.logger.error(err, exc_info=True) + +# return generator of descendants including non-direct ones +def get_non_direct_subclasses(module, cls): + try: + for name, obj in inspect.getmembers(module): + # 1. check that obj is a class + if inspect.isclass(obj): + # 2. check that obj is a direct descendant of class + if issubclass(obj, cls): + yield obj + else: + # WARNING: assumption that there is only one class of the same name in all of the modules + for objparent in obj.__bases__: + if objparent.__name__ == cls.__name__: + yield obj + except Exception as err: + Globals.logger.error(err, exc_info=True) + + + diff --git a/hermes/modules/hermesglobals.py b/hermes/modules/hermesglobals.py new file mode 100644 index 0000000..4cf5115 --- /dev/null +++ b/hermes/modules/hermesglobals.py @@ -0,0 +1,36 @@ +import os + +class Globals(object): + """Globals contains global variables shared by all files. + + Args: + verbose: a boolean variable that prints out debug log messages + logger: logging object that logs messages + scsingleton: Spark Context. There can only be one scsingleton running. 
+ DIR_VECTORS_PATH: a constant string that refers to the directory where vectorgenerators for specific datasets are resided + DIR_RECOMMENDERS_PATH: a constant string that refers to the directory where recommendergenerators for specific recommenders are resided + DIR_METRICS_PATH: a constant string that refers to the directory where metricgenerators for specific metrics are resided + """ + + class Constants(object): + def __init__(self): + self.USERVECTOR = "UserVector" + self.CONTENTVECTOR = "ContentVector" + self.ROOT_PATH = os.path.dirname(os.path.dirname(os.path.dirname(os.path.realpath(__file__)))) + self.DIR_VECTORS_NAME = "vg" + self.DIR_VECTORS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + self.DIR_VECTORS_NAME + self.DIR_RECOMMENDERS_NAME = "rg" + self.DIR_RECOMMENDERS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + self.DIR_RECOMMENDERS_NAME + self.DIR_METRICS_NAME = "mg" + self.DIR_METRICS_PATH = os.path.dirname(os.path.realpath(__file__)) + "/" + self.DIR_METRICS_NAME + + def __setattr__(self, attr, value): + if hasattr(self, attr): + print("ERROR: cannot reset a constant variable %s = %s" % (attr, value)) + else: + self.__dict__[attr] = value + + verbose = False + logger = None + scsingleton = None + constants = Constants() diff --git a/hermes/modules/metricgenerator.py b/hermes/modules/metricgenerator.py new file mode 100644 index 0000000..90f08bc --- /dev/null +++ b/hermes/modules/metricgenerator.py @@ -0,0 +1,69 @@ + +import os +import sys +sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/..")) +import algorithms.performance_metrics as pm + +""" + +eggsecutor = MetricExecutor(RMSE()) +print eggsecutor.execute(vector) +eggsecutor.change_metric(PRFS()) +print eggsecutor.execute(vector) + +""" + +# ================================================================================ +# Metric Strategy +# ================================================================================ + +class MetricExecutor: + def __init__(self, metric): + self.metric = metric + + def execute(self, vector): + return self.metric.calculate_metric(vector) + + def change_metric(self, new_metric): + print "changing metric to %s" % new_metric + self.metric = new_metric + +# ================================================================================ +# List of metrics +# ================================================================================ + +class MetricFactory(object): + def create_obj_metric(self, metric_str): + which_metric = getattr(sys.modules[__name__], metric_str) + print "which_metric: ", which_metric + if not which_metric: + # cannot find class + raise ValueError + else: + print "calling on which_metric()" + return which_metric() + +class Metric: + def calculate_metric(self, vector=None) : + raise NotImplemented + +class RMSE(Metric): + def calculate_metric(self, vector): + print "executing RMSE" + print vector.test_vector.take(5) + print vector.prediction_vector.take(5) + return pm.calculate_rmse(vector.test_vector, vector.prediction_vector) + +class MAE(Metric): + def calculate_metric(self, vector): + print "executing MAE" + print vector.test_vector.take(5) + print vector.prediction_vector.take(5) + return pm.calculate_mae(vector.test_vector, vector.prediction_vector) + +class PRFS(Metric): + pass + + + + diff --git a/hermes/modules/mg/__init__.py b/hermes/modules/mg/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hermes/modules/recommendergenerator.py b/hermes/modules/recommendergenerator.py new file mode 100644 index 
0000000..d441217 --- /dev/null +++ b/hermes/modules/recommendergenerator.py @@ -0,0 +1,74 @@ +""" + +with_tfidf = WithTfidf() +without_tfidf = WithoutTfidf() + +recommender = ALS(with_tfidf) +recommender.make_prediction() + +recommender = ALS(without_tfdif) # same as: recommender = ALS() +recommender.make_prediction() + +recommender = CBWithKMeans(with_tfidf) +recommender.make_prediction() + +recommender = CBWithKMeans(without_tfidf) # same as: recommender = CBWithKMeans +recommender.make_prediction() + +""" + +import sys +import timer + +import helper + +from hermesglobals import Globals +from rg.default_usecase import Default + +# ================================================================================ +# Bridge: bridge target interface & background implementation +# ================================================================================ + +class Recommender(object): + def __init__(self, user_vector, content_vector=None, implementation=Default()): + self.user_vector = user_vector + self.content_vector = content_vector + self.implementation = implementation + + def make_prediction(self): + # target interface + raise NotImplemented + +# ================================================================================ +# Recommender Factory +# ================================================================================ + +class RecommenderFactory(object): + def create_obj_recommender(self, recommender_str, user_vector, content_vector=None, implementation=Default()): + which_recommender = getattr(sys.modules[__name__], recommender_str) + if not which_recommender: + # cannot find class + raise ValueError + else: + return which_recommender(user_vector, content_vector, implementation) + + +# ================================================================================ +# Variant of target interface +# ================================================================================ + +class ALS(Recommender): + def make_prediction(self): + return self.implementation.make_prediction_with_als(self.user_vector, self.content_vector) + +class CBWithKMeans(Recommender): + def make_prediction(self): + return self.implementation.make_prediction_with_cbkmeans(self.user_vector, self.content_vector) + +class UserUser(Recommender): + def make_prediction(self): + return self.implementation.make_prediction_with_useruser(self.user_vector, self.content_vector) + +class ItemItem(Recommender): + def make_prediction(self): + return self.implementation.make_prediction_with_itemitem(self.user_vector, self.content_vector) \ No newline at end of file diff --git a/hermes/modules/rg/__init__.py b/hermes/modules/rg/__init__.py new file mode 100644 index 0000000..ae4a8ac --- /dev/null +++ b/hermes/modules/rg/__init__.py @@ -0,0 +1,4 @@ +import default_usecase +import interface +import with_tfidf_usecase +import without_tfidf_usecase \ No newline at end of file diff --git a/hermes/modules/rg/default_usecase.py b/hermes/modules/rg/default_usecase.py new file mode 100644 index 0000000..d5e48d8 --- /dev/null +++ b/hermes/modules/rg/default_usecase.py @@ -0,0 +1,28 @@ +from interface import ImplementationInterface + +import hermes.algorithms.cf as cf + +# ================================================================================ +# Concrete background implementations for default use cases +# ================================================================================ + +class Default(ImplementationInterface): + def make_prediction_with_als(self, user_vector, content_vector): + 
user_vector.prediction_vector = cf.calc_cf_mllib(user_vector.training_vector) + return user_vector.prediction_vector + + """ + # TODO: specify rank based on what the user wants + import pyspark.mllib.recommendation as mllib + model = mllib.ALS.train(vector.training_vector, rank=3) + prediction_vector = model.predictAll( vector.test_vector.map( lambda x: (x[0], x[1]) ) ).cache() + return prediction_vector + """ + + def make_prediction_with_useruser(self, user_vector, content_vector): + user_vector.prediction_vector = cf.calc_user_user_cf2(user_vector.training_vector) + return user_vector.prediction_vector + + def make_prediction_with_itemitem(self, user_vector, content_vector): + user_vector.prediction_vector = cf.calc_item_item_cf(user_vector.training_vector) + return user_vector.prediction_vector diff --git a/hermes/modules/rg/interface.py b/hermes/modules/rg/interface.py new file mode 100644 index 0000000..841e7f9 --- /dev/null +++ b/hermes/modules/rg/interface.py @@ -0,0 +1,16 @@ +# ================================================================================ +# Background implementation interface +# ================================================================================ + +class ImplementationInterface(object): + def make_prediction_with_als(self): + raise NotImplemented + + def make_prediction_with_cbkmeans(self): + raise NotImplemented + + def make_prediction_with_useruser(self): + raise NotImplemented + + def make_prediction_with_itemitem(self): + raise NotImplemented \ No newline at end of file diff --git a/hermes/modules/rg/second_usecase.py b/hermes/modules/rg/second_usecase.py new file mode 100644 index 0000000..99d76de --- /dev/null +++ b/hermes/modules/rg/second_usecase.py @@ -0,0 +1,14 @@ +from interface import ImplementationInterface + +import hermes.algorithms.cf as cf + +# ================================================================================ +# Concrete background implementations for default use cases +# ================================================================================ + +class SecondUseCase(ImplementationInterface): + + def make_prediction_with_useruser(self, user_vector, content_vector): + user_vector.prediction_vector = cf.calc_user_user_cf(user_vector.training_vector) + return user_vector.prediction_vector + diff --git a/hermes/modules/rg/with_tfidf_usecase.py b/hermes/modules/rg/with_tfidf_usecase.py new file mode 100644 index 0000000..d051958 --- /dev/null +++ b/hermes/modules/rg/with_tfidf_usecase.py @@ -0,0 +1,8 @@ +from interface import ImplementationInterface + +# ================================================================================ +# Concrete background implementations for use cases with tf-idf +# ================================================================================ + +class WithTfidf(ImplementationInterface): + pass \ No newline at end of file diff --git a/hermes/modules/rg/without_tfidf_usecase.py b/hermes/modules/rg/without_tfidf_usecase.py new file mode 100644 index 0000000..2b3ebb1 --- /dev/null +++ b/hermes/modules/rg/without_tfidf_usecase.py @@ -0,0 +1,8 @@ +from interface import ImplementationInterface + +# ================================================================================ +# Concrete background implementations for use cases without tf-idf +# ================================================================================ + +class WithoutTfidf(ImplementationInterface): + pass \ No newline at end of file diff --git a/src/examples/singleton.py b/hermes/modules/singleton.py 
similarity index 87% rename from src/examples/singleton.py rename to hermes/modules/singleton.py index c327406..767e142 100644 --- a/src/examples/singleton.py +++ b/hermes/modules/singleton.py @@ -4,6 +4,8 @@ from pyspark.sql import SQLContext class SCSingleton(object): + """ Wrapper for Spark Context to prevent multiple instantiation of the Spark Context. """ + __instance = None def __new__(cls, conf): diff --git a/hermes/modules/statemachine.py b/hermes/modules/statemachine.py new file mode 100644 index 0000000..06a4ada --- /dev/null +++ b/hermes/modules/statemachine.py @@ -0,0 +1,59 @@ +class InitializationError(Exception): + def __init__(self, value): + self.value = value + + def __str__(self): + return repr(self.value) + +class StateMachine: + """ + To emulate a state machine. + + Example: + # state1 -> state2 -> state3a + -> state3b + # where state1, state2, state3a, and state3b are defined functions. + + import StateMachine + sm = StateMachine() + sm.add_state(state1) + sm.add_state(state2) + sm.add_state(state3a, isEndState=True) + sm.add_state(state3b, isEndState=True) + sm.set_start(state1) + sm.run() + """ + + def __init__(self): + self.handlers = [] + self.startState = None + self.endStates = [] + + def add_state(self, handler, isEndState=False): + self.handlers.append(handler) + if isEndState: + self.endStates.append(handler) + + def set_start(self, handler): + self.startState = handler + + def run(self, cargo=None): + if not self.startState: + raise InitializationError("Must call .set_start() before .run()") + if not self.endStates: + raise InitializationError("Must call .set_start() before .run()") + + handler = self.startState + + while True: + (newState, cargo) = handler(cargo) + if newState in self.endStates: + newState(cargo) + break + elif newState not in self.handlers: + print self.handlers + raise RuntimeError("Invalid state %s" % newState) + else: + handler = newState + + return self \ No newline at end of file diff --git a/hermes/modules/timer.py b/hermes/modules/timer.py new file mode 100644 index 0000000..5927d98 --- /dev/null +++ b/hermes/modules/timer.py @@ -0,0 +1,22 @@ +import time + +class Timer(object): + """ + To time how long a particular function runs. 
+ + Example: + import Timer + with Timer() as t: + somefunction() + print("somefunction() takes %s seconds" % t.secs) + print("somefunction() takes %s milliseconds" % t.msecs) + """ + + def __enter__(self): + self.start = time.time() + return self + + def __exit__(self, *args): + self.end = time.time() + self.secs = self.end - self.start + self.msecs = self.secs * 1000 \ No newline at end of file diff --git a/hermes/modules/vectorgenerator.py b/hermes/modules/vectorgenerator.py new file mode 100644 index 0000000..8bf1450 --- /dev/null +++ b/hermes/modules/vectorgenerator.py @@ -0,0 +1,112 @@ + +# vector generator == rdd generator + +import helper +from hermesglobals import Globals + +# ================================================================================ +# Vector Factory +# ================================================================================ + +class VectorFactory(object): + def create_vector(self, data, support_files, runs_from_notebook=False): + # select which vector to create + vector = None + if data.which_vector == Globals.constants.USERVECTOR: + vector = UserVector + elif data.which_vector == Globals.constants.CONTENTVECTOR: + vector = ContentVector + else: + raise Exception + # select if we are loading modules from a directory or a zip + generator = None + if runs_from_notebook: + generator = helper.load_modules_in_zip(Globals.constants.ROOT_PATH, Globals.constants.DIR_VECTORS_NAME) + else: + generator = helper.load_modules_in_dir(Globals.constants.DIR_VECTORS_PATH) + # get subclasses that inherit from either UserVector or ContentVector + # from modules in hermes/hermes/modules/vectors directory + for module in generator: + for subclass in helper.get_direct_subclasses(module, vector): + if subclass.isSameDataInstance(data): + return subclass(data, support_files).vector + else: + # cannot find class that builds the data + raise ValueError + + def create_obj_vector(self, data, support_files, runs_from_notebook=False): + # select which vector to create + vector = None + if data.which_vector == Globals.constants.USERVECTOR: + vector = UserVector + elif data.which_vector == Globals.constants.CONTENTVECTOR: + vector = ContentVector + else: + raise Exception + # select if we are loading modules from a directory or a zip + generator = None + if runs_from_notebook: + generator = helper.load_modules_in_zip(Globals.constants.ROOT_PATH, Globals.constants.DIR_VECTORS_NAME) + else: + generator = helper.load_modules_in_dir(Globals.constants.DIR_VECTORS_PATH) + # get subclasses that inherit from either UserVector or ContentVector + # from modules in hermes/hermes/modules/vectors directory + for module in generator: + for subclass in helper.get_direct_subclasses(module, vector): + if subclass.isSameDataInstance(data): + return subclass(data, support_files) + else: + # cannot find class that builds the data + raise ValueError + +# ================================================================================ +# Vector Factory Objects +# ================================================================================ + +class Vector(object): + def __init__(self, data, support_files): + self.data = data + self.support_files = support_files + vector_transformation = getattr(self, data.vector_transformation) + if not vector_transformation: + self.vector = None + else: + self.vector = vector_transformation() + + def split_data(self, weights, seed): + raise NotImplemented + +# ================================================================================ +# User Vector and 
Content Vector Factory Objects +# ================================================================================ + +class UserVector(Vector): + def __init__(self, data, support_files): + super(UserVector, self).__init__(data, support_files) + self.training_vector = None + self.test_vector = None + self.validation_vector = None + self.prediction_vector = None + + def split_data(self, weights, seed): + training_vector, test_vector, validation_vector = self.vector.randomSplit(weights, seed) + self.training_vector = training_vector + self.test_vector = test_vector + self.validation_vector = validation_vector + +class ContentVector(Vector): + def __init__(self, data, support_files, uservector=None, runs_from_notebook=False): + super(ContentVector, self).__init__(data, support_files) + # TODO: terrible, quick fix -> fix it for real in the future + if uservector is not None: + self.uservector = uservector + else: + self.uservector = VectorFactory().create_obj_vector(self.data.uservectordata, support_files, runs_from_notebook) + + + +# ================================================================================ +# User Vector and Content Vector for specific datasetes +# defined in hermes/hermes/modules/vectors +# ================================================================================ + diff --git a/hermes/modules/vg/__init__.py b/hermes/modules/vg/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hermes/modules/vg/movielens_vectorgenerator.py b/hermes/modules/vg/movielens_vectorgenerator.py new file mode 100644 index 0000000..e4a12f6 --- /dev/null +++ b/hermes/modules/vg/movielens_vectorgenerator.py @@ -0,0 +1,47 @@ +from modules.vectorgenerator import UserVector, ContentVector +from modules.hermesglobals import Globals + +# ================================================================================ +# MovieLens +# ================================================================================ + +# TODO: do we need isSameDataInstance()? can we eliminate it? 
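The classes below are what VectorFactory actually discovers: it walks the modules in hermes/modules/vg/, collects the direct subclasses of UserVector / ContentVector, keeps the one whose isSameDataInstance() matches the config's `dataname`, and then calls the method named by `vector_transformation`. A minimal, hypothetical generator for a new dataset would follow the same shape (the "books" dataset and its columns are invented for illustration):

```python
from modules.vectorgenerator import UserVector

class Books(object):
    @classmethod
    def isSameDataInstance(cls, comparisonData):
        # matched against "dataname = books" in the [datasets] config section
        return comparisonData.dataname == "books"

class BooksUserVector(UserVector, Books):
    def ratings(self):
        # "vector_transformation = ratings" in the config resolves to this method
        return self.data.dataframe.map(lambda row: (row.user_id, row.book_id, row.rating))
```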
+class MovieLens(object): + @classmethod + def isSameDataInstance(cls, comparisonData): + return comparisonData.dataname == "movielens" + +class MovieLensUserVector(UserVector, MovieLens): + def ratings(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)) + + def pos_ratings(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, row.rating)).filter(lambda (u, m, r): r > 3) + + def ratings_to_interact(self): + return self.data.dataframe.map(lambda row: (row.user_id, row.movie_id, -1 if row.rating < 3 else 1)) + +class MovieLensContentVector(ContentVector, MovieLens): + def genre(self): + def get_genre(row): + return np.array(( + int(row.genre_action), + int(row.genre_adventure), + int(row.genre_animation), + int(row.genre_childrens), + int(row.genre_comedy), + int(row.genre_crime), + int(row.genre_documentary), + int(row.genre_drama), + int(row.genre_fantasy), + int(row.genre_filmnoir), + int(row.genre_horror), + int(row.genre_musical), + int(row.genre_mystery), + int(row.genre_romance), + int(row.genre_scifi), + int(row.genre_thriller), + int(row.genre_war), + int(row.genre_western), + )) + return self.data.dataframe.map(lambda row: (row.movie_id, get_genre(row))) diff --git a/hermes/modules/vg/wiki_vectorgenerator.py b/hermes/modules/vg/wiki_vectorgenerator.py new file mode 100644 index 0000000..48ab958 --- /dev/null +++ b/hermes/modules/vg/wiki_vectorgenerator.py @@ -0,0 +1,39 @@ +from modules.vectorgenerator import UserVector, ContentVector +from modules.hermesglobals import Globals + +# ================================================================================ +# Wiki +# ================================================================================ + +class Wiki(object): + @classmethod + def isSameDataInstance(cls, comparisonData): + return comparisonData.dataname == "wiki" + +class WikiUserVector(UserVector, Wiki): + def __init__(self): + super(self.__class__, self).__init__() + self.filtered = Globals.scsingleton.sqlCtx.sql("select * from ratings where redirect_target is null and article_namespace=0 and user_id is not null") + self.filtered.registerTempTable("wiki_ratings") + + def num_edits(self): + return Globals.scsingleton.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings group by user_id, article_id") + + def any_interact(self): + return Globals.scsingleton.sqlCtx.sql("select user_id as user, article_id as item, 1 as rating from wiki_ratings group by user_id, article_id") + + def num_edits_ceil(self): + return Globals.scsingleton.sqlCtx.sql("select user_id as user, article_id as item, count(1) as rating from wiki_ratings group by user_id, article_id")\ + .map(lambda (user, article, rating): (user, article, max(rating, 5))) + +class WikiContentVector(ContentVector, Wiki): + def __init__(self): + super(self.__class__, self).__init__() + self.filtered_content = Globals.scsingleton.sqlCtx.sql("select * from content where redirect_target is null and article_namespace=0 and full_text is not null") + self.filtered_content.registerTempTable("wiki_content") + + def glove(self): + raise NotImplemented + + def category_map(self): + raise NotImplemented \ No newline at end of file diff --git a/hermes/utils/__init__.py b/hermes/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/article_to_category.py b/hermes/utils/article_to_category.py similarity index 100% rename from src/utils/article_to_category.py rename to 
hermes/utils/article_to_category.py diff --git a/src/utils/book_crossing_etl/README.md b/hermes/utils/book_crossing_etl/README.md similarity index 100% rename from src/utils/book_crossing_etl/README.md rename to hermes/utils/book_crossing_etl/README.md diff --git a/src/utils/book_crossing_etl/bookcrossing.py b/hermes/utils/book_crossing_etl/bookcrossing.py similarity index 100% rename from src/utils/book_crossing_etl/bookcrossing.py rename to hermes/utils/book_crossing_etl/bookcrossing.py diff --git a/src/utils/clean_categories.py b/hermes/utils/clean_categories.py similarity index 100% rename from src/utils/clean_categories.py rename to hermes/utils/clean_categories.py diff --git a/src/utils/clean_links.py b/hermes/utils/clean_links.py similarity index 100% rename from src/utils/clean_links.py rename to hermes/utils/clean_links.py diff --git a/src/utils/code_etl/blame_to_json.py b/hermes/utils/code_etl/blame_to_json.py similarity index 96% rename from src/utils/code_etl/blame_to_json.py rename to hermes/utils/code_etl/blame_to_json.py index b7efa60..74aefd1 100755 --- a/src/utils/code_etl/blame_to_json.py +++ b/hermes/utils/code_etl/blame_to_json.py @@ -114,14 +114,17 @@ def clean_email(email): return email.strip("<>") # Function to convert timezone to hour office integer def tz_int(tz): return int(tz, 10) + # Function to clean out non-ascii characters + def clean_text(text): return ''.join([i if ord(i) < 128 else '' for i in text]) + # Translation from the porcelain key to the key in our JSON object, as well # as an option transformation to apply first porcelain_to_json = { - "author": ("author", None), + "author": ("author", clean_text), "author-mail": ("author_mail", clean_email), "author-time": ("author_time", int), "author-tz": ("author_timezone", tz_int), - "committer": ("committer", None), + "committer": ("committer", clean_text), "committer-mail": ("committer_mail", clean_email), "committer-time": ("committer_time", int), "committer-tz": ("committer_timezone", tz_int), diff --git a/src/utils/code_etl/cd.py b/hermes/utils/code_etl/cd.py similarity index 100% rename from src/utils/code_etl/cd.py rename to hermes/utils/code_etl/cd.py diff --git a/src/utils/code_etl/git_manager.py b/hermes/utils/code_etl/git_manager.py similarity index 100% rename from src/utils/code_etl/git_manager.py rename to hermes/utils/code_etl/git_manager.py diff --git a/src/utils/code_etl/repo_to_json.py b/hermes/utils/code_etl/repo_to_json.py similarity index 100% rename from src/utils/code_etl/repo_to_json.py rename to hermes/utils/code_etl/repo_to_json.py diff --git a/src/utils/code_etl/user_to_file_mapper.py b/hermes/utils/code_etl/user_to_file_mapper.py similarity index 93% rename from src/utils/code_etl/user_to_file_mapper.py rename to hermes/utils/code_etl/user_to_file_mapper.py index 150a5f0..f013330 100755 --- a/src/utils/code_etl/user_to_file_mapper.py +++ b/hermes/utils/code_etl/user_to_file_mapper.py @@ -142,6 +142,19 @@ def parse_block(block, file_map): file_map[file] = [(name, email)] +def clean_text(text): + """ Remove non-ascii characters from a string. + + Args: + text (str): A string. + + Returns: + str: A string with all characters with ord() >= 128 removed. + + """ + return ''.join([i if ord(i) < 128 else '' for i in text]) + + def file_map_to_json(file_map, repo_name): """Returns a list of JSON objects as strings containing the `git log` information. 
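The clean_text() helper added in both blame_to_json.py and user_to_file_mapper.py strips rather than transliterates: any character with ord() >= 128 is dropped. A tiny self-contained illustration (the one-line definition is repeated so the snippet runs on its own):

```python
# Same definition as the helper above, repeated so the example is runnable.
def clean_text(text):
    return ''.join([i if ord(i) < 128 else '' for i in text])

# Accented bytes are removed outright (Python 2 byte string shown).
print clean_text("Jos\xe9 P\xe9rez")  # -> "Jos Prez"
```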
@@ -160,8 +173,8 @@ def file_map_to_json(file_map, repo_name): for key, count in counter.iteritems(): current_json = deepcopy(JSON_LINE) current_json["repo_name"] = repo_name - current_json["author"] = key[0] - current_json["author_mail"] = key[1] + current_json["author"] = clean_text(key[0]) + current_json["author_mail"] = clean_text(key[1]) current_json["filename"] = file current_json["edit_count"] = count jsons.append(json.dumps(current_json)) diff --git a/src/utils/content_vector_tf_idf.py b/hermes/utils/content_vector_tf_idf.py similarity index 100% rename from src/utils/content_vector_tf_idf.py rename to hermes/utils/content_vector_tf_idf.py diff --git a/src/utils/glove.py b/hermes/utils/glove.py similarity index 100% rename from src/utils/glove.py rename to hermes/utils/glove.py diff --git a/src/utils/jester_etl/README.md b/hermes/utils/jester_etl/README.md similarity index 100% rename from src/utils/jester_etl/README.md rename to hermes/utils/jester_etl/README.md diff --git a/src/utils/jester_etl/jester.py b/hermes/utils/jester_etl/jester.py similarity index 100% rename from src/utils/jester_etl/jester.py rename to hermes/utils/jester_etl/jester.py diff --git a/src/utils/lastfm_etl/README.md b/hermes/utils/lastfm_etl/README.md similarity index 100% rename from src/utils/lastfm_etl/README.md rename to hermes/utils/lastfm_etl/README.md diff --git a/src/utils/lastfm_etl/lastfm.py b/hermes/utils/lastfm_etl/lastfm.py similarity index 100% rename from src/utils/lastfm_etl/lastfm.py rename to hermes/utils/lastfm_etl/lastfm.py diff --git a/src/utils/movielens_etl/ml10m_to_json.py b/hermes/utils/movielens_etl/ml10m_to_json.py similarity index 100% rename from src/utils/movielens_etl/ml10m_to_json.py rename to hermes/utils/movielens_etl/ml10m_to_json.py diff --git a/src/utils/movielens_etl/ml1m_to_json.py b/hermes/utils/movielens_etl/ml1m_to_json.py similarity index 100% rename from src/utils/movielens_etl/ml1m_to_json.py rename to hermes/utils/movielens_etl/ml1m_to_json.py diff --git a/src/utils/movielens_etl/ml20m_to_json.py b/hermes/utils/movielens_etl/ml20m_to_json.py similarity index 100% rename from src/utils/movielens_etl/ml20m_to_json.py rename to hermes/utils/movielens_etl/ml20m_to_json.py diff --git a/src/utils/movielens_etl/movielens.py b/hermes/utils/movielens_etl/movielens.py similarity index 100% rename from src/utils/movielens_etl/movielens.py rename to hermes/utils/movielens_etl/movielens.py diff --git a/src/utils/osm_etl/osm.py b/hermes/utils/osm_etl/osm.py similarity index 52% rename from src/utils/osm_etl/osm.py rename to hermes/utils/osm_etl/osm.py index 0a30798..910b684 100755 --- a/src/utils/osm_etl/osm.py +++ b/hermes/utils/osm_etl/osm.py @@ -63,35 +63,37 @@ #JSON osm objects (node, way, or relation): OSM_OBJECT = { "id": None, + "lat": None, + "lon": None, "timestamp": None, "version": None, "changeset": None, "visible": None, "user": None, "uid": None, - "type": None, #can be node, way or relation - "lat": None, - "lon": None, + "osm_type": None, #can be Node, Way or Relation #add in tags "source": None, "building": None, "highway": None, "name": None, - "addr:city": None, - "addr:postcode": None, + "addr_city": None, + "addr_postcode": None, "natural": None, "landuse": None, - "surfacewaterway": None, + "surface": None, + "waterway": None, "power": None, "wall": None, "oneway": None, "amenity": None, "ref": None, - "building:levels": None, + "building_levels": None, "maxspeed": None, "barrier": None, "type": None, - "placefoot": None, + "place": None, + "foot": 
None, "bicycle": None, "railway": None, "leisure": None, @@ -127,82 +129,102 @@ ) args = parser.parse_args() + tag_names2 = ['source', 'building', 'highway', 'name', 'addr_city', 'addr_postcode', 'natural', 'landuse', 'surface',\ + 'waterway','power','wall','oneway','amenity','ref', 'building_levels', 'maxspeed','barrier','type','place',\ + 'foot','bicycle','railway','leisure','bridge', 'parking','man_made','railway','aeroway', 'wikipedia'] + changeset_json_file = open( args.output_directory +"/changeset_test.json", 'w') node_json_file = open( args.output_directory +"/node_test.json", 'w') relation_map = open( args.output_directory +"/relation_map.txt", 'w') osm_tree = ET.iterparse(args.osm_history, events=("start", "end")) - for event, elem in osm_tree: - #print event, elem - if event == 'start' and elem.tag=='changeset': - #print elem, elem.attrib - #create a new changeset object - c = deepcopy(CHANGESET) - for key, value in elem.attrib.iteritems(): - c[key] = value #for now all values are strings - #print json.dumps(c) - changeset_json_file.write(json.dumps(c) + "\n") - - elif event == 'start' and elem.tag=='node': - #create a new node object - n = deepcopy(OSM_OBJECT) - for key, value in elem.attrib.iteritems(): - n[key] = value #for now all values are strings - n["type"] = 'Node' - for child in elem: - if child.tag =='tag': - for key, value in child.attrib.iteritems(): - n[key] = value - - elif event=='end' and elem.tag=='node': - node_json_file.write(json.dumps(n) + "\n") - - elif event == 'start' and elem.tag=='way': - #create a new node object of type way - n = deepcopy(OSM_OBJECT) - nid = elem.get('id') - for key, value in elem.attrib.iteritems(): - n[key] = value #for now all values are strings - n["type"] = 'Way' - for child in elem: - if child.tag =='tag': - for key, value in child.attrib.iteritems(): - n[key] = value - elif child.tag =='nd': - ref_id = child.get('ref') - #these are the node references, we will create a file that is w_id, n_id, w_type, n_type - relation_map.write(nid + ',' + ref_id + ', way, node' + "\n") - else: - print child.tag - - elif event=='end' and elem.tag=='way': - #print json.dumps(n) - node_json_file.write(json.dumps(n) + "\n") - - - elif event == 'start' and elem.tag=='relation': - #create a new node object of type way - n = deepcopy(OSM_OBJECT) - nid = elem.get('id') - for key, value in elem.attrib.iteritems(): - n[key] = value #for now all values are strings - n["type"] = 'Relation' - for child in elem: - if child.tag =='tag': - for key, value in child.attrib.iteritems(): - n[key] = value - elif child.tag =='member': - ref_id = child.get('ref') - r_type = child.get('type') - #these are the node references, we will create a file that is w_id, n_id, w_type, n_type - relation_map.write(nid + ',' + ref_id + ', relation, '+ r_type + "\n") - else: - print child.tag - - elif event=='end' and elem.tag=='way': - #print json.dumps(n) - node_json_file.write(json.dumps(n) + "\n") +for event, elem in osm_tree: + if event == 'start' and elem.tag=='changeset': + #print elem, elem.attrib + #create a new changeset object + c = deepcopy(CHANGESET) + for key, value in elem.attrib.iteritems(): + c[key] = value #for now all values are strings + #print json.dumps(c) + changeset_json_file.write(json.dumps(c) + "\n") + + elif event == 'start' and elem.tag=='node': + #create a new node object + n = deepcopy(OSM_OBJECT) + for key, value in elem.attrib.iteritems(): + #print key, value + key = key.replace(":", "_") + n[key] = value #for now all values are strings + 
n["osm_type"] = 'Node' + for child in elem: + if child.tag =='tag': + k, v = child.attrib.items() + key = k[1] + value = v[1] + key = key.replace(":", "_") + if key in tag_names2: + n[key] = value + #print key, value + + elif event=='end' and elem.tag=='node': + #print json.dumps(n) + node_json_file.write(json.dumps(n) + "\n") + + elif event == 'start' and elem.tag=='way': + #create a new node object of type way + n = deepcopy(OSM_OBJECT) + nid = elem.get('id') + for key, value in elem.attrib.iteritems(): + n[key] = value #for now all values are strings + n["osm_type"] = 'Way' + for child in elem: + if child.tag =='tag': + k, v = child.attrib.items() + key = k[1] + value = v[1] + key = key.replace(":", "_") + if key in tag_names2: + n[key] = value + elif child.tag =='nd': + ref_id = child.get('ref') + #these are the node references, we will create a file that is w_id, n_id, w_type, n_type + relation_map.write(nid + ',' + ref_id + ', way, node' + "\n") + else: + print child.tag + + elif event=='end' and elem.tag=='way': + #print json.dumps(n) + node_json_file.write(json.dumps(n) + "\n") + + + elif event == 'start' and elem.tag=='relation': + #create a new node object of type way + n = deepcopy(OSM_OBJECT) + nid = elem.get('id') + for key, value in elem.attrib.iteritems(): + key.replace(":", "_") + n[key] = value #for now all values are strings + n["osm_type"] = 'Relation' + for child in elem: + if child.tag =='tag': + k, v = child.attrib.items() + key = k[1] + value = v[1] + key = key.replace(":", "_") + if key in tag_names2: + n[key] = value + elif child.tag =='member': + ref_id = child.get('ref') + r_type = child.get('type') + #these are the node references, we will create a file that is w_id, n_id, w_type, n_type + relation_map.write(nid + ',' + ref_id + ', relation, '+ r_type + "\n") + else: + print child.tag + + elif event=='end' and elem.tag=='way': + #print json.dumps(n) + node_json_file.write(json.dumps(n) + "\n") changeset_json_file.close() diff --git a/src/utils/remove_templates.py b/hermes/utils/remove_templates.py similarity index 100% rename from src/utils/remove_templates.py rename to hermes/utils/remove_templates.py diff --git a/src/utils/save_load.py b/hermes/utils/save_load.py similarity index 80% rename from src/utils/save_load.py rename to hermes/utils/save_load.py index ec96385..f395bff 100644 --- a/src/utils/save_load.py +++ b/hermes/utils/save_load.py @@ -76,4 +76,23 @@ def load_content_vector(input_fname): content1 = line[1].strip("[]") content = [float(i) for i in str.split(content1, ' ')] content_vector.append((item, content)) - return content_vector \ No newline at end of file + return content_vector + +def save_uv_to_hadoop(vector, output_name): + vector.map(lambda x: ','.join(map(str,x))).saveAsTextFile(output_name) + +def load_uv_from_hadoop(input_name, sc, num_partitions=20): + uv = sc.textFile(input_name).map(parseText)\ + .repartition(num_partitions) + return uv + +def parseText(row): + row = row.split(',') + return (int(row[0]), int(row[1]), float(row[2])) + +def save_cv_to_hadoop(vector, output_name): + vector.saveAsPickleFile(output_name) + +def load_cv_from_hadoop(input_name,sc, num_partitions=20): + cv = sc.pickleFile(input_name).repartition(num_partitions) + return cv diff --git a/src/utils/wiki_categories.py b/hermes/utils/wiki_categories.py similarity index 100% rename from src/utils/wiki_categories.py rename to hermes/utils/wiki_categories.py diff --git a/src/utils/xml_to_json.py b/hermes/utils/xml_to_json.py similarity index 100% rename from 
src/utils/xml_to_json.py rename to hermes/utils/xml_to_json.py diff --git a/notebooks/framework_in_a_notebook.ipynb b/notebooks/framework_in_a_notebook.ipynb new file mode 100644 index 0000000..276ae75 --- /dev/null +++ b/notebooks/framework_in_a_notebook.ipynb @@ -0,0 +1,1098 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Framework" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "debug = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Grabbing the \"framework\" branch from GitHub and use the \"hermes\" folder as a library" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Step 1: Install necessary libraries." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import importlib\n", + "import pip\n", + "\n", + "def _install(package):\n", + " pip.main(['install', package])\n", + "\n", + "def _import(package):\n", + " importlib.import_module(package)\n", + " \n", + "def install_and_import(package):\n", + " try:\n", + " _import(package)\n", + " except ImportError:\n", + " _install(package)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting GitPython\n", + " Downloading GitPython-1.0.1.tar.gz (355kB)\n", + "Collecting gitdb>=0.6.4 (from GitPython)\n", + " Downloading gitdb-0.6.4.tar.gz (400kB)\n", + "Collecting smmap>=0.8.5 (from gitdb>=0.6.4->GitPython)\n", + " Downloading smmap-0.9.0.tar.gz\n", + "Building wheels for collected packages: GitPython, gitdb, smmap\n", + " Running setup.py bdist_wheel for GitPython\n", + " Stored in directory: /Users/tiffanyj/Library/Caches/pip/wheels/23/f4/31/1d0570ae6ecccca26eafb087788483f614cd740281fd842660\n", + " Running setup.py bdist_wheel for gitdb\n", + " Stored in directory: /Users/tiffanyj/Library/Caches/pip/wheels/63/1b/54/87cf226ccefad0e5fdc78e3c8c65180ac77ed2a04d1dec3a56\n", + " Running setup.py bdist_wheel for smmap\n", + " Stored in directory: /Users/tiffanyj/Library/Caches/pip/wheels/47/75/63/333cdcb6d3e6e8eb1ec6869564b84f7f1e6a875d87541a0ae9\n", + "Successfully built GitPython gitdb smmap\n", + "Installing collected packages: smmap, gitdb, GitPython\n", + "Successfully installed GitPython-1.0.1 gitdb-0.6.4 smmap-0.9.0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using pip version 7.1.2, however version 8.0.2 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting click\n", + " Downloading click-6.2-py2.py3-none-any.whl (70kB)\n", + "Installing collected packages: click\n", + "Successfully installed click-6.2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "You are using pip version 7.1.2, however version 8.0.2 is available.\n", + "You should consider upgrading via the 'pip install --upgrade pip' command.\n" + ] + } + ], + "source": [ + "install_and_import(\"GitPython\")\n", + "install_and_import(\"click\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Step 2: Create a temporary directory.\n", + "\n", + "Step 3: Git clone the \"framework\" branch from GitHub to the temporary directory.\n", + "\n", + "Step 4: Zip 
the hermes source files.\n", + "\n", + "Step 5: Add zip to SparkContext.\n", + "\n", + "Step 6: Remove temporary directory once it is no longer needed." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "remote_url = \"https://github.com/tiffanyj41/hermes.git\"\n", + "remote_branch = \"framework\"\n", + "source_dir = \"hermes\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# helper functions\n", + "import os\n", + "import functools\n", + "\n", + "def _list_all_in_dir(dir_path):\n", + " for path, subdirs, files in os.walk(dir_path):\n", + " for filename in files:\n", + " print os.path.join(path, filename)\n", + " \n", + "def _zip_dir(srcdir_path, zipfile_handler):\n", + " try:\n", + " zipfile_handler.writepy(srcdir_path)\n", + " finally:\n", + " zipfile_handler.close()\n", + " \n", + "def trackcalls(func):\n", + " @functools.wraps(func)\n", + " def wrapper(*args, **kwargs):\n", + " wrapper.has_been_called = True\n", + " return func(*args, **kwargs)\n", + " wrapper.has_been_called = False\n", + " return wrapper\n", + "\n", + "@trackcalls\n", + "def _add_zipfile_to_sc(zipfile_path):\n", + " sc.addPyFile(zipfile_path) " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'sc' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mzipfile\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 7\u001b[1;33m \u001b[1;32mprint\u001b[0m \u001b[0msc\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 8\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 9\u001b[0m \u001b[1;31m# create a temporary directory\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mNameError\u001b[0m: name 'sc' is not defined" + ] + } + ], + "source": [ + "import git\n", + "import os\n", + "import tempfile\n", + "import shutil\n", + "import zipfile \n", + "\n", + "# create a temporary directory\n", + "tmpdir_path = tempfile.mkdtemp()\n", + "if debug: print \"temporary directory: %s\\n\" % tmpdir_path\n", + "\n", + "# ensure file is read/write by creator only\n", + "saved_umask = os.umask(0077)\n", + "\n", + "# create a zipfile handler to zip the necessary files\n", + "ziptmpdir_path = tempfile.mkdtemp()\n", + "if debug: print \"temporary directory for zip file: %s\\n\" % ziptmpdir_path\n", + "zipfile_path = ziptmpdir_path + \"/hermes_src.zip\"\n", + "if debug: print \"zip file's path: %s\\n\" % zipfile_path\n", + "zipfile_handler = zipfile.PyZipFile(zipfile_path, \"w\")\n", + "\n", + "# make zipfile handler verbose for debugging\n", + "zipfile_handler.debug = 3\n", + "\n", + "try:\n", + " # clone \"framework\" branch from GitHub into temporary directory\n", + " local_branch = git.Repo.clone_from(remote_url, tmpdir_path, branch=remote_branch)\n", + " if debug: print \"current branch: %s\\n\" % local_branch.head.ref\n", + " if debug: print \"list all in %s:\" % tmpdir_path; _list_all_in_dir(tmpdir_path); print \"\\n\"\n", + " \n", + " # zip \"hermes\" 
directory\n", + " if debug: print \"zipping: %s\\n\" % os.path.join(tmpdir_path, source_dir)\n", + " _zip_dir(os.path.join(tmpdir_path, source_dir), zipfile_handler)\n", + " \n", + " # check zip file\n", + " if debug: print \"Is zip file %s valid? %s\\n\" % (zipfile_path, zipfile.is_zipfile(zipfile_path))\n", + " \n", + " # add zip to SparkContext \n", + " # note: you can only add zip to SparkContext one time\n", + " if not _add_zipfile_to_sc.has_been_called:\n", + " if debug: print \"add zip file %s into spark context\\n\" % zipfile_path\n", + " _add_zipfile_to_sc(zipfile_path)\n", + " else:\n", + " if debug: print \"zip file %s is already added into spark context; will not re-add\\n\" % zipfile_path\n", + " \n", + "except IOError as e:\n", + " raise e\n", + "else:\n", + " os.remove(zipfile_path)\n", + "finally:\n", + " os.umask(saved_umask)\n", + " shutil.rmtree(tmpdir_path)\n", + " shutil.rmtree(ziptmpdir_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example 1\n", + "* Run movielens_10m_ratings with **ratings** vector transformation\n", + "* Implement **ALS** recommender system algorithms\n", + "* Implement **RMSE, MAE** metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Framework is based on a state machine. Since you are using a notebook, it is unlikely that you will use a state machine to automate the process, but you can use parts of the state machine to do what you need to do." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 1: __start()\n", + "**For those who use [MovieLens 1M CF test src code](http://l41-srv-mcdh32.b.internal:8880/notebooks/Hermes/MovieLens%201M%20CF%20test%20src%20code.ipynb#) as guidance, this is executing the pre-requisites when the HDFS directory and the input data are not defined yet.**\n", + "\n", + "Function: \n", + "* __start() creates the HDFS directory and uploads the input data. 
\n", + "* __start() implements the start_state of the state machine.\n", + "\n", + "```bash\n", + "\n", + "def __start(cargo):\n", + " \"\"\"start_state without the state machine.\"\"\"\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"In start_state:\")\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"Creating the hdfs directory \" + cargo.hdfs_dir)\n", + " os.system(\"hdfs dfs -mkdir \" + cargo.hdfs_dir)\n", + "\n", + " def load_json_files(datas):\n", + " for i in range(0, len(datas)):\n", + " json_path = datas[i].datapath\n", + " if Globals.verbose: Globals.logger.debug(\"Loading JSON file \" + json_path + \" into hdfs directory \" + cargo.hdfs_dir)\n", + " os.system(\"hdfs dfs -put \" + json_path + \" \" + cargo.hdfs_dir + \"/\" + os.path.basename(json_path))\n", + "\n", + " load_json_files(cargo.datas)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/tiffanyj/datasets/movielens/movielens_1m_movies.json.gz\n", + "/datasets/movielens/1m/movielens_1m_movies.json.gz\n", + "/home/tiffanyj/datasets/movielens/movielens_1m_ratings.json.gz\n", + "/datasets/movielens/1m/movielens_1m_ratings.json.gz\n" + ] + } + ], + "source": [ + "import os\n", + "hdfs_dir = \"/datasets/movielens/1m\"\n", + "movies_json_path = \"/home/tiffanyj/datasets/movielens/movielens_1m_movies.json.gz\"\n", + "movies_json_path_in_hdfs = hdfs_dir + \"/\" + os.path.basename(movies_json_path)\n", + "ratings_json_path = \"/home/tiffanyj/datasets/movielens/movielens_1m_ratings.json.gz\"\n", + "ratings_json_path_in_hdfs = hdfs_dir + \"/\" + os.path.basename(ratings_json_path)\n", + "\n", + "print movies_json_path\n", + "print movies_json_path_in_hdfs\n", + "print ratings_json_path \n", + "print ratings_json_path_in_hdfs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 1: You implement what is already in __start() manually yourself" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "256" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "# create hdfs_dir \n", + "os.system(\"hdfs dfs -mkdir \" + hdfs_dir)\n", + "# put json located at json_path into hdfs_dir\n", + "os.system(\"hdfs dfs -put \" + ratings_json_path + \" \" + ratings_json_path_in_hdfs)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "ImportError", + "evalue": "No module named hermes", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mImportError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[1;32mfrom\u001b[0m \u001b[0mhermes\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[1;33m*\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mmodules\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;31m# define Data (ie. 
UserVectorData) which is a class wrapper of the json\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m \u001b[1;31m# and will be used to create a Vector (ie. UserVector)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mImportError\u001b[0m: No module named hermes" + ] + } + ], + "source": [ + "from hermes import *\n", + "import modules.data\n", + "\n", + "# define Data (ie. UserVectorData) which is a class wrapper of the json \n", + "# and will be used to create a Vector (ie. UserVector)\n", + "datapath = ratings_json_path\n", + "vector_transformation = \"ratings\"\n", + "schemapath = None\n", + "dataname = \"movielens\"\n", + "\n", + "uservectordata = modules.data.UserVectorData(datapath, vector_transformation, schemapath, dataname)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 2: You execute using the __start() function" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# import hermes where __start() function is defined\n", + "from hermes import *\n", + "# import cargo where Cargo class is defined\n", + "import modules.cargo\n", + "# import data where configuration is defined\n", + "import modules.data\n", + "\n", + "# create cargo\n", + "cargo = modules.cargo.Cargo()\n", + "\n", + "# add items to cargo\n", + "cargo.hdfs_dir = hdfs_dir\n", + "\n", + "# define Data and put it in cargo\n", + "dataname = \"movielens\"\n", + "datapath = ratings_json_path\n", + "vector_transformation = \"ratings\"\n", + "schemapath = None\n", + "uservectordata = modules.data.UserVectorData(datapath, vector_transformation, schemapath, dataname)\n", + "cargo.datas.append(uservectordata)\n", + "\n", + "# call the start function\n", + "hermes.__start(cargo)\n", + "\n", + "uservectordata = cargo.datas[0]\n", + "uservectordata.cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: __json_to_rdd()\n", + "**For those who use [MovieLens 1M CF test src code](http://l41-srv-mcdh32.b.internal:8880/notebooks/Hermes/MovieLens%201M%20CF%20test%20src%20code.ipynb#) as guidance, this is accomplishing cell # 5, 6, 7.**\n", + "\n", + "Function: \n", + "* __json_to_rdd() parses JSON to RDD. 
\n", + "* __json_to_rdd() implements the json_to_rdd state of the state machine.\n", + "\n", + "```bash\n", + " \"\"\"json_to_rdd_state without the state macine.\"\"\"\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"In json_to_rdd_state:\")\n", + "\n", + " # create RDD for each JSON file and store it in Cargo's vectors list\n", + " for i in range(0, len(cargo.datas)):\n", + " data = cargo.datas[i]\n", + " if Globals.verbose: Globals.logger.debug(\"Working with json file %s\" % data.datapath)\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"Creating dataframe based on the content of the json file\")\n", + " datapath_in_hdfs = \"hdfs://\" + cargo.fs_default_ip_addr + \"/\" + cargo.hdfs_dir + \"/\" + os.path.basename(data.datapath)\n", + " data.set_dataframe(Globals.scsingleton.sc, Globals.scsingleton.sqlCtx, datapath_in_hdfs)\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"Creating RDD based on the computed dataframe and configuration provided by the user\")\n", + " cargo.vectors.append( vg.VectorFactory().create_obj_vector(data, cargo.support_files) ) \n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 1: You implement what is already in __json_to_rdd() manually yourself\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "import modules.data\n", + "import modules.vectorgenerator\n", + "\n", + "# convert JSON to Dataframe\n", + "uservectordata.set_dataframe(sc, sqlCtx, ratings_json_path_in_hdfs) \n", + "ratings = uservectordata.dataframe # extracting dataframe variable from UserVectorData class\n", + "\n", + "# this is the same thing as \n", + "# ratings = sqlCtx.read.json(\"hdfs://\" + ratings_json_path_in_hdfs)\n", + "# ratings.repartition(sc.defaultParallelism * 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "import modules.vectorgenerator\n", + "import modules.vg\n", + "\n", + "# support_files is a dictionary that you can pass in during vector creation \n", + "support_files = {}\n", + "\n", + "# convert DataFrame to RDD\n", + "mv = modules.vectorgenerator.VectorFactory().create_obj_vector(uservectordata, None, True) \n", + "all_user_ratings = mv.vector\n", + "\n", + "# this is the same thing as \n", + "# mv = movieLens_vectorize.movieLens_vectorize(ratings, None, \"ratings\", \"none\")\n", + "# all_user_ratings = mv.get_user_vector()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print type(all_user_ratings)\n", + "all_user_ratings.take(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 2: You execute using the __json_to_rdd() function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "\n", + "cargo.fs_default_ip_addr = \"\"\n", + "cargo.hdfs_dir = hdfs_dir[1:]\n", + "cargo.support_files = {}\n", + "\n", + "# call json_to_rdd function\n", + "hermes.__json_to_rdd(cargo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "mv = cargo.vectors[0]\n", + "all_user_ratings = mv.vector\n", + "print type(all_user_ratings)\n", + 
"all_user_ratings.take(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 3: __split_data()\n", + "**For those who use [MovieLens 1M CF test src code](http://l41-srv-mcdh32.b.internal:8880/notebooks/Hermes/MovieLens%201M%20CF%20test%20src%20code.ipynb#) as guidance, this is accomplishing cell # 8, 9.**\n", + "\n", + "Function: \n", + "* __split_data() splits data to train, test, and (optional) validate. \n", + "* __split_data() implements the split_data_state of the state machine.\n", + "\n", + "```bash\n", + "def __split_data(cargo):\n", + " \"\"\"split_data_state without the state machine.\"\"\"\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"In split_data_state:\")\n", + "\n", + " for i in range(0, len(cargo.vectors)):\n", + " vector = cargo.vectors[i]\n", + " weights, seed = hermesui._ask_user_for_split_percentage(vector.data.datapath)\n", + " vector.split_data(weights, seed)\n", + "\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "trainingPercentage = 60/100.\n", + "testPercentage = 40/100.\n", + "validationPercentage = 0/100.\n", + "seed = 11" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 1: You implement what is already in __split_data() manually yourself" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "uservector = mv\n", + "\n", + "uservector.split_data([trainingPercentage, testPercentage, validationPercentage], seed)\n", + "train_ratings = uservector.training_vector\n", + "test_ratings = uservector.test_vector\n", + "validation_ratings = uservector.validation_vector\n", + "\n", + "# this is the same thing as\n", + "# train_ratings, test_ratings = uservector.vector.randomSplit([0.6, 0.4], 11)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "train_ratings.cache()\n", + "test_ratings.cache()\n", + "validation_ratings.cache()\n", + "\n", + "print train_ratings.count(), test_ratings.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 2: you execute using the __split_data() function" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n# TODO: will implement later\\n'" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from hermes import *\n", + "\n", + "# call split_data function\n", + "hermes.__split_data(cargo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "mv = cargo.vectors[0]\n", + "train_ratings = mv.training_vector\n", + "test_ratings = mv.test_vector\n", + "validation_ratings = mv.validation_vector\n", + "print train_ratings.count(), test_ratings.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 4: __make_prediction()\n", + "**For those who use [MovieLens 1M CF test src code](http://l41-srv-mcdh32.b.internal:8880/notebooks/Hermes/MovieLens%201M%20CF%20test%20src%20code.ipynb#) as guidance, this is accomplishing cell # 10.**\n", + "\n", + "Function: \n", + "* __make_prediction() develop model based on the train data and make prediction based on this model. 
\n", + "* __make_prediction() implements the make_prediction_state of the state machine.\n", + "\n", + "```bash\n", + "def __make_prediction(cargo):\n", + " \"\"\"make_prediction_state without the state machine.\"\"\"\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"In make_prediction_state:\") \n", + "\n", + " for i in range(0, len(cargo.vectors)):\n", + " thisvector = cargo.vectors[i]\n", + "\n", + " # select which recommenders based on the vector type\n", + " recommenders = None\n", + " thisvector_uservector = None\n", + " thisvector_contentvector = None\n", + " if helper.is_direct_subclass(thisvector, vg.UserVector):\n", + " if Globals.verbose: Globals.logger.debug(\"Iterating through recommenders for user vector on data %s\", thisvector.data.datapath)\n", + " thisvector_uservector = thisvector\n", + " recommenders = cargo.user_recommenders\n", + " elif helper.is_direct_subclass(thisvector, vg.ContentVector):\n", + " if Globals.verbose: Globals.logger.debug(\"Iterating through recommenders for content vector on data %s\", thisvector.data.datapath)\n", + " thisvector_contentvector = thisvector\n", + " thisvector_uservector = thisvector.uservector\n", + " recommenders = cargo.content_recommenders\n", + "\n", + " # run all recommenders on the vector\n", + " for r in recommenders:\n", + " if Globals.verbose: Globals.logger.debug(\"Making recommendation %s on data %s\", r, thisvector.data.datapath)\n", + " # TODO: implement other use case, ie. WithTfidf(), etc.\n", + " recommender = rg.RecommenderFactory().create_obj_recommender(r, thisvector_uservector, thisvector_contentvector)\n", + " # default use case\n", + " # recommender = RecommenderFactory().create_obj_recommender(r, vector, Default())\n", + " # with tf-idf use case \n", + " # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithTfidf())\n", + " # without tf-idf use case\n", + " # recommender = RecommenderFactory().create_obj_recommender(r, vector, WithoutTfidf())\n", + " # etc.\n", + " with Timer() as t:\n", + " prediction_vector = recommender.make_prediction()\n", + " if Globals.verbose: Globals.logger.debug(\"Making prediction takes %s seconds\" % t.secs)\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 1: You implement what is already in __make_prediciton() manually yourself" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "import modules.recommendergenerator\n", + "\n", + "# create recommender object with the default use case\n", + "recommender_str = \"ALS\"\n", + "recommender = modules.recommendergenerator.RecommenderFactory().create_obj_recommender(recommender_str, uservector)\n", + "# or\n", + "# modules.recommendergenerator.RecommenderFactory().create_obj_recommender(recommender, uservector, Default())\n", + "\n", + "# get the prediction vector\n", + "prediction_vector = recommender.make_prediction()\n", + "# or\n", + "# prediction_vector = uservector.prediction\n", + "predicted1 = prediction_vector" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "[Rating(user=36455, product=12, rating=3.1620100630939234),\n", + " Rating(user=13019, product=12, rating=3.009068937170033),\n", + " Rating(user=1199, product=12, rating=1.889880680902047),\n", 
+ " Rating(user=56039, product=12, rating=1.8340114917394583),\n", + " Rating(user=68279, product=12, rating=2.575869762437719)]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prediction_vector.cache()\n", + "predicted1.cache()\n", + "\n", + "print type(predicted1)\n", + "predicted1.take(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "import algorithms.cf\n", + "\n", + "# instead of doing the step above, you can also call the function directly\n", + "prediction_vector = algorithms.cf.calc_cf_mllib(uservector.training_vector)\n", + "predicted2 = prediction_vector" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "[Rating(user=22502, product=12, rating=2.145246574980865),\n", + " Rating(user=22514, product=12, rating=1.8239622809024438),\n", + " Rating(user=22526, product=12, rating=1.6218700820020784),\n", + " Rating(user=22538, product=12, rating=3.22630662094852),\n", + " Rating(user=22550, product=12, rating=2.568704193724831)]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print type(predicted2)\n", + "predicted2.take(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# both ways are the same thing as\n", + "# predicted = algorithms.cf.calc_cf_mllib(uservector.training_vector)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 2: you execute using the __make_prediction() function" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n# TODO: will implement later\\n'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from hermes import *\n", + "\n", + "cargo.user_recommenders = [\"ALS\"]\n", + "cargo.content_recommenders = []\n", + "\n", + "# call make_prediction function\n", + "hermes.__make_prediction(cargo)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "mv = cargo.vectors[0]\n", + "prediction_vector = mv.prediction_vector\n", + "print type(prediction_vector)\n", + "prediction_vector.take(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 5: __calculate_metrics()\n", + "**For those who use [MovieLens 1M CF test src code](http://l41-srv-mcdh32.b.internal:8880/notebooks/Hermes/MovieLens%201M%20CF%20test%20src%20code.ipynb#) as guidance, this is accomplishing cell # 11.**\n", + "\n", + "Function: \n", + "* __calculate_metrics() tests the metrics specified by the user. 
\n", + "* __calculate_metrics() implements the calculate_metrics_state of the state machine.\n", + "\n", + "```bash\n", + "def __calculate_metrics(cargo):\n", + " \"\"\"calculate_metrics_state without the state machine.\"\"\"\n", + "\n", + " if Globals.verbose: Globals.logger.debug(\"In calculate_metrics_state:\")\n", + "\n", + " # create a metric executor\n", + " executor = mg.MetricExecutor(mg.Metric())\n", + "\n", + " for i in range(0, len(cargo.vectors)):\n", + " Globals.logger.info(\"-\" * 80)\n", + " Globals.logger.info(\"Data: %s\" % cargo.vectors[i].data.datapath)\n", + " for m in cargo.metrics:\n", + " # check if metric exists\n", + " metric = mg.MetricFactory().create_obj_metric(m)\n", + " # set metric in executor\n", + " executor.change_metric(metric)\n", + " # execute the metric\n", + " with Timer() as t:\n", + " Globals.logger.info(\"Metric: %s = %f\" % (m, executor.execute(cargo.vectors[i])))\n", + " if Globals.verbose: Globals.logger.debug(\"Calculating metric takes %s seconds\" % t.secs)\n", + " Globals.logger.info(\"-\" * 80)\n", + "\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 1: You implement what is already in __calculate_metrics() manually yourself" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "import modules.metricgenerator \n", + "\n", + "# create metric executor\n", + "executor = modules.metricgenerator.MetricExecutor(modules.metricgenerator.Metric())\n", + "\n", + "# create metric object\n", + "metric_str = \"RMSE\"\n", + "rmse_metric = modules.metricgenerator.MetricFactory().create_obj_metric(metric_str)\n", + "\n", + "# set metric in executor \n", + "executor.change_metric(rmse_metric)\n", + "\n", + "# calculate metric\n", + "rmse = executor.execute(uservector)\n", + "\n", + "print \"RMSE: \", rmse\n", + "\n", + "# switch metric object\n", + "metric_str = \"MAE\"\n", + "mae_metric = modules.metricgenerator.MetricFactory().create_obj_metric(metric_str)\n", + "executor.change_metric(mae_metric)\n", + "\n", + "# calculate metric\n", + "mae = executor.execute(uservector)\n", + "\n", + "print \"MAE: \", mae\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "import algorithms.performance_metrics\n", + "\n", + "# instead of doing the step above, you can also call the function directly\n", + "rmse = algorithms.performance_metrics.calculate_rmse(uservector.test_vector, uservector.prediction_vector)\n", + "print \"RMSE: \", rmse\n", + "\n", + "mae = algorithms.performance_metrics.calculate_mae(uservector.test_vector, uservector.prediction_vector)\n", + "print \"MAE: \", mae" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# both ways are the same thing as\n", + "# rmse = algorithms.performance_metrics.calculate_rmse(uservector.test_vector, uservector.prediction_vector)\n", + "# mae = algorithms.performance_metrics.calculate_mae(uservector.test_vector, uservector.prediction_vector)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Option 2: you execute using the __calculate_metrics() function" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from hermes import *\n", + "\n", + 
"cargo.metrics = [\"ALS\"]\n", + "\n", + "# call calculate_metrics function\n", + "hermes.__calculate_metrics(cargo)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..6472b9a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,28 @@ +boto==2.36.0 +click==5.1 +cycler==0.9.0 +docopt==0.6.2 +hermes==1.0 +matplotlib==1.5.0 +mesos==0.25.0 +mesos.cli==0.25.0 +mesos.interface==0.25.0 +mesos.native==0.25.0 +numpy==1.10.1 +pandas==0.17.0 +pbr==1.8.1 +protobuf==2.6.1 +psutil==3.2.2 +py==1.4.30 +pyparsing==2.0.5 +pytest==2.8.2 +python-dateutil==2.4.2 +pytz==2015.7 +scikit-learn==0.17 +scipy==0.16.1 +six==1.10.0 +stevedore==1.9.0 +virtualenv==13.1.2 +virtualenv-clone==0.2.6 +virtualenvwrapper==4.7.1 +wheel==0.26.0 diff --git a/scripts/list_requirements.sh b/scripts/list_requirements.sh new file mode 100755 index 0000000..ef0028b --- /dev/null +++ b/scripts/list_requirements.sh @@ -0,0 +1 @@ +pip freeze > $PWD/../requirements.txt diff --git a/scripts/run_once.sh b/scripts/run_once.sh new file mode 100755 index 0000000..4278dc6 --- /dev/null +++ b/scripts/run_once.sh @@ -0,0 +1,2 @@ +python setup.py install +pip install --editable . diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4ce2f1e --- /dev/null +++ b/setup.py @@ -0,0 +1,76 @@ +from __future__ import print_function +from setuptools import setup, find_packages +from setuptools.command.test import test as TestCommand +import codecs +import os +import sys +import re + +here = os.path.abspath(os.path.dirname(__file__)) + +def read(*parts): + # intentionally *not* adding an encoding option to open + return codecs.open(os.path.join(here, *parts), 'r').read() + +def find_version(*file_paths): + version_file = read(*file_paths) + version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", + version_file, re.M) + if version_match: + return version_match.group(1) + raise RuntimeError("Unable to find version string.") + +long_description = read('README.md') + +class PyTest(TestCommand): + def finalize_options(self): + TestCommand.finalize_options(self) + self.test_args = ['--strict', '--verbose', '--tb=long', 'tests'] # tests if rename src to hermes + self.test_suite = True + + def run_tests(self): + import pytest + errno = pytest.main(self.test_args) + sys.exit(errno) + +setup( + name='hermes', + version=find_version('hermes', '__init__.py'), + url='http://github.com/lab41/hermes/', + license='Apache Software License', + author='Lab 41', + description='Exploration of Recommender Systems', + long_description=long_description, + tests_require=['pytest'], + install_requires=['click', + ], + cmdclass={'test': PyTest}, + entry_points={ + 'console_scripts': [ + 'hermes = hermes.hermesctl:main', + ], + }, + py_modules=['hermes'], + #scripts=['scripts/somescript.py'], + packages=['hermes', 'hermes.modules', 'hermes.metrics', 'hermes.utils'], + include_package_data=True, + platforms='any', + test_suite='tests.test_hermes.py', + zip_safe=False, + #package_data={'hermes': ['templates/**', 'static/*/*']}, + classifiers = [ + 'Programming Language :: Python', + 'Programming Language :: Python 
:: 2', + 'Programming Language :: Python :: 3', + 'Development Status :: 1', + 'Natural Language :: English', + 'Environment :: Spark Environment', + 'Intended Audience :: Developers, Data Scientists', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: MAC OS X', + 'Topic :: Recommender System', + ], + extras_require={ + 'testing': ['pytest'], + } +) diff --git a/src/examples/timer.py b/src/examples/timer.py deleted file mode 100644 index d61bb2d..0000000 --- a/src/examples/timer.py +++ /dev/null @@ -1,16 +0,0 @@ -import time - -class Timer(object): - def __init__(self, verbose=False): - self.verbose = verbose - - def __enter__(self): - self.start = time.time() - return self - - def __exit__(self, *args): - self.end = time.time() - self.secs = self.end - self.start - self.msecs = self.secs * 1000 - if self.verbose: - print "elapsed time: %f ms" % self.msecs \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_hermes.py b/tests/test_hermes.py new file mode 100644 index 0000000..e69de29
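setup.py wires pytest in through the PyTest command class (test_suite='tests.test_hermes.py', tests_require=['pytest']), but tests/test_hermes.py is added empty here. A sketch of what a first test could look like once the package installs; the hermes.utils.timer import path is an assumption and should point at wherever the new Timer context manager actually lives:

```python
# Hypothetical starter content for the (currently empty) tests/test_hermes.py,
# executed by the pytest hook that setup.py's PyTest command class registers.
import time

# Assumed import path; adjust to the module that defines the new Timer.
from hermes.utils.timer import Timer

def test_timer_records_elapsed_time():
    with Timer() as t:
        time.sleep(0.01)
    assert t.secs > 0
    assert t.msecs == t.secs * 1000
```

With a test in place, `python setup.py test` (or a plain `pytest tests`) runs the suite.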