Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
language: python
python:
- "2.6"
- "2.7"
- "3.6"
before_install:
# install NumPy dependencies
- sudo apt-get update -qq
Expand All @@ -10,4 +9,4 @@ before_install:
install:
- python setup.py install
script:
- nosetests tests
- nosetests tests
4 changes: 2 additions & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
include README.md
include ez_setup.py
recursive-include pytldr/stopwords *
include LICENSE.txt
recursive-include pytldr/stopwords *
18 changes: 10 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
[![Build Status](https://travis-ci.org/jaijuneja/PyTLDR.svg?branch=master)](https://travis-ci.org/jaijuneja/PyTLDR) [![PyPI version](https://badge.fury.io/py/pytldr.svg)](https://pypi.python.org/pypi/pytldr)

A Python module to perform automatic summarization of articles, text files and web pages.
This repository is a fork of Jai Juneja's original PyTLDR repository.

## License

Expand All @@ -18,17 +19,16 @@ You should have received a copy of the GNU General Public License along with thi

### Using pip or easy_install

You can download the latest release version using `pip` or `easy_install`:

~~You can download the latest release version using `pip` or `easy_install`:~~
See the related issue [here](https://github.com/jaijuneja/PyTLDR/issues/1).
```
pip install pytldr
```

### Latest development version
You can alternatively download the latest development version directly from GitHub:
### Latest development version in Python 3
You can alternatively download the latest development version directly from GitHub for Python 3:

```
git clone https://github.com/jaijuneja/PyTLDR.git
git clone https://github.com/vinodnimbalkar/PyTLDR.git
```

Change into the root directory:
Expand All @@ -40,7 +40,9 @@ cd pytldr
Then install the package:

```
python setup.py install
uv venv -p 3.12
source .venv/bin/activate
uv pip install -e .
```

## Usage
Expand Down Expand Up @@ -154,4 +156,4 @@ help(RelevanceSummarizer)

## Contact

If you have any questions or have encountered an error, feel free to contact me at `jai -dot- juneja -at- gmail -dot- com`.
If you have any questions or have encountered an error, feel free to contact me at `jai -dot- juneja -at- gmail -dot- com`.
16 changes: 8 additions & 8 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,26 +76,26 @@
relevance = RelevanceSummarizer()
textrank = TextRankSummarizer()

print '\n\nLSA Ozsoy:\n'
print('\n\nLSA Ozsoy:\n')
summary = lsa_o.summarize(txt, length=5)

for sentence in summary:
print sentence
print(sentence)

print '\n\nLSA Steinberger:\n'
print('\n\nLSA Steinberger:\n')
summary = lsa_s.summarize(txt, length=5)

for sentence in summary:
print sentence
print(sentence)

print '\n\nRelevance:\n'
print('\n\nRelevance:\n')
summary = relevance.summarize(txt, length=5)

for sentence in summary:
print sentence
print(sentence)

print '\n\nTextRank:\n'
print('\n\nTextRank:\n')
summary = textrank.summarize(txt, length=5)

for sentence in summary:
print sentence
print(sentence)
2 changes: 1 addition & 1 deletion ez_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
try:
from urllib.request import urlopen
except ImportError:
from urllib2 import urlopen
from urllib.request import urlopen

try:
from site import USER_SITE
Expand Down
54 changes: 54 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "PyTLDR"
version = "0.1.5"
description = "A module to perform automatic article summarization."
readme = "README.md"
requires-python = ">=3.12"
license = { file = "LICENSE.txt" }
authors = [
{ name = "Jai Juneja", email = "jai.juneja@gmail.com" },
]
keywords = [
"summarizer",
"summarization",
"natural language processing",
"nlp",
"machine learning",
"data mining",
"latent semantic analysis",
"lsa",
]
classifiers = [
"Development Status :: 3 - Alpha",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Information Analysis",
"Topic :: Text Processing :: Filters",
"Topic :: Text Processing :: Linguistic",
"Intended Audience :: Developers",
"License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.12",
]
urls = { Homepage = "https://github.com/vinodnimbalkar/PyTLDR", Repository = "https://github.com/vinodnimbalkar/PyTLDR" }
dependencies = [
"numpy>=1.14.3",
"nltk>=3.4",
"scipy>=1.1.0",
"scikit-learn>=0.20.1",
"networkx>=2.2",
]

[tool.setuptools]
include-package-data = true

[tool.setuptools.packages.find]
include = ["pytldr*"]

[tool.setuptools.package-data]
"pytldr" = ["stopwords/*.txt"]
16 changes: 9 additions & 7 deletions pytldr/nlp/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,18 @@


def unicode_to_ascii(unicodestr):
if isinstance(unicodestr, bytes):
unicodestr = unicodestr.decode('utf-8', errors='ignore')
if isinstance(unicodestr, str):
return unicodestr
elif isinstance(unicodestr, unicode):
return unicodedata.normalize('NFKD', unicodestr).encode('ascii', 'ignore')
else:
raise ValueError('Input text must be of type str or unicode.')
normalized = unicodedata.normalize('NFKD', unicodestr)
return normalized.encode('ascii', 'ignore').decode('ascii')
raise ValueError('Input text must be of type str or bytes.')


def parse_input(text, extractor='newspaper'):
if isinstance(text, str) or isinstance(text, unicode):
if isinstance(text, bytes):
text = text.decode('utf-8', errors='ignore')
if isinstance(text, str):
if text.startswith(('http://', 'https://')):
# Input is a link - need to extract the text from html
if extractor.lower() == 'goose':
Expand All @@ -36,4 +38,4 @@ def parse_input(text, extractor='newspaper'):
# Input is a string containing the raw text
return unicode_to_ascii(text)
else:
raise ValueError('Input text must be of type str or unicode.')
raise ValueError('Input text must be of type str or bytes.')
10 changes: 5 additions & 5 deletions pytldr/nlp/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from nltk.stem import SnowballStemmer
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from string import punctuation
from preprocess import unicode_to_ascii
from .preprocess import unicode_to_ascii


class Tokenizer(object):
Expand All @@ -17,7 +17,7 @@ def __init__(self, language='english', stopwords=None, stemming=True):

if isinstance(stopwords, list):
self._stopwords = stopwords
elif isinstance(stopwords, (str, unicode)):
elif isinstance(stopwords, str):
# stopwords argument is a path
try:
self._stopwords = self._load_stopwords(stopwords)
Expand All @@ -42,7 +42,7 @@ def stemmer(self):
@staticmethod
def _load_stopwords(file_path):
try:
with open(file_path, 'rb') as stopwords_file:
with open(file_path, 'r') as stopwords_file:
stopwords = [word.strip('\n') for word in stopwords_file.readlines()]
except IOError:
stopwords = []
Expand Down Expand Up @@ -109,7 +109,7 @@ def _remove_whitespace(text):
if not non_spaces:
return text

first_non_space = non_spaces.next()
first_non_space = next(non_spaces)
first_non_space = first_non_space.start()

last_non_space = None
Expand Down Expand Up @@ -179,4 +179,4 @@ def tokenize_paragraphs(cls, text):

# Remove empty strings from list
paragraphs = [p for p in paragraphs if p]
return paragraphs
return paragraphs
4 changes: 1 addition & 3 deletions pytldr/summarize/baseclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
from abc import ABCMeta, abstractmethod


class BaseSummarizer(object):
__metaclass__ = ABCMeta

class BaseSummarizer(object, metaclass=ABCMeta):
def __init__(self, tokenizer=Tokenizer('english')):
self._tokenizer = tokenizer

Expand Down
4 changes: 2 additions & 2 deletions pytldr/summarize/lsa.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
import numpy as np
from baseclass import BaseSummarizer
from .baseclass import BaseSummarizer
from scipy.sparse.linalg import svds
from warnings import warn

Expand Down Expand Up @@ -168,4 +168,4 @@ def summarize(self, text, topics=4, length=5, binary_matrix=True, topic_sigma_th


# Default LsaSummarizer just uses the Ozsoy method
LsaSummarizer = LsaOzsoy
LsaSummarizer = LsaOzsoy
14 changes: 10 additions & 4 deletions pytldr/summarize/textrank.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
# -*- coding: utf-8 -*-
from __future__ import division

import networkx
from .baseclass import BaseSummarizer


try:
from networkx import from_scipy_sparse_array as _from_scipy_sparse
except ImportError: # networkx<3.0
from networkx import from_scipy_sparse_matrix as _from_scipy_sparse


class TextRankSummarizer(BaseSummarizer):

def summarize(self, text, length=5, weighting='frequency', norm=None):
Expand Down Expand Up @@ -36,14 +42,14 @@ def summarize(self, text, length=5, weighting='frequency', norm=None):
# combinations of sentences.
similarity_matrix = (word_matrix * word_matrix.T)

similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
similarity_graph = _from_scipy_sparse(similarity_matrix)
scores = networkx.pagerank(similarity_graph)

ranked_sentences = sorted(
((score, ndx) for ndx, score in scores.items()), reverse=True
((score, ndx) for ndx, score in list(scores.items())), reverse=True
)

top_sentences = [ranked_sentences[i][1] for i in range(length)]
top_sentences.sort()

return [unprocessed_sentences[i] for i in top_sentences]
return [unprocessed_sentences[i] for i in top_sentences]
7 changes: 0 additions & 7 deletions requirements.txt

This file was deleted.

73 changes: 2 additions & 71 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,4 @@
# -*- coding: utf-8 -*-
import ez_setup
ez_setup.use_setuptools(version='0.7')

from setuptools import setup
import os

PACKAGE_NAME = 'PyTLDR'
VERSION = '0.1.5'


def read(filename):
filepath = os.path.join(os.path.dirname(__file__), filename)
try:
# Convert GitHub markdown to restructured text (needed for upload to PyPI)
from pypandoc import convert
return convert(filepath, 'rst')
except ImportError:
return open(filepath).read()

description = 'A module to perform automatic article summarization.'
try:
long_description = read('README.md')
except IOError:
long_description = description

setup(
name=PACKAGE_NAME,
version=VERSION,
author='Jai Juneja',
author_email='jai.juneja@gmail.com',
description=description,
license='BSD',
keywords= [
'summarizer', 'summarization', 'natural language processing', 'nlp',
'machine learning', 'data mining', 'latent semantic analysis', 'lsa'
],
url='https://github.com/jaijuneja/PyTLDR',
packages=[
'pytldr',
'pytldr.nlp',
'pytldr.summarize'
],
long_description=long_description,
classifiers=[
'Development Status :: 3 - Alpha',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Scientific/Engineering :: Information Analysis',
'Topic :: Text Processing :: Filters',
'Topic :: Text Processing :: Linguistic',
'Intended Audience :: Developers',
'License :: OSI Approved :: BSD License',
'Operating System :: OS Independent',
'Programming Language :: Python'
],
install_requires=[
'numpy==1.8.0',
'nltk==2.0.5',
'scipy==0.13.2',
'scikit-learn==0.15.2',
'goose-extractor==1.0.25',
'newspaper==0.0.9.8',
'networkx==1.9.1'
],
include_package_data=True,
package_data={PACKAGE_NAME: ['stopwords/*.txt'],
'': ['README.md', 'ez_setup.py']},
tests_require=[
'nose',
'coverage',
],
test_suite='nose.collector'
)
if __name__ == "__main__":
setup()
Loading