Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
language: python
python:
- "2.6"
- "2.7"
- "3.6"
before_install:
# install NumPy dependencies
- sudo apt-get update -qq
Expand All @@ -10,4 +9,4 @@ before_install:
install:
- python setup.py install
script:
- nosetests tests
- nosetests tests
4 changes: 2 additions & 2 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
include README.md
include ez_setup.py
recursive-include pytldr/stopwords *
include LICENSE.txt
recursive-include pytldr/stopwords *
18 changes: 10 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
[![Build Status](https://travis-ci.org/jaijuneja/PyTLDR.svg?branch=master)](https://travis-ci.org/jaijuneja/PyTLDR) [![PyPI version](https://badge.fury.io/py/pytldr.svg)](https://pypi.python.org/pypi/pytldr)

A Python module to perform automatic summarization of articles, text files and web pages.
This repository is a fork of Jai Juneja's original PyTLDR repository.

## License

Expand All @@ -18,17 +19,16 @@ You should have received a copy of the GNU General Public License along with thi

### Using pip or easy_install

You can download the latest release version using `pip` or `easy_install`:

~~You can download the latest release version using `pip` or `easy_install`:~~
See the related issue [here](https://github.com/jaijuneja/PyTLDR/issues/1).
```
pip install pytldr
```

### Latest development version
You can alternatively download the latest development version directly from GitHub:
### Latest development version in Python 3
You can alternatively download the latest development version directly from GitHub for Python 3:

```
git clone https://github.com/jaijuneja/PyTLDR.git
git clone https://github.com/vinodnimbalkar/PyTLDR.git
```

Change into the root directory:
Expand All @@ -40,7 +40,9 @@ cd pytldr
Then install the package:

```
python setup.py install
uv venv -p 3.12
source .venv/bin/activate
uv pip install -e .
```

## Usage
Expand Down Expand Up @@ -154,4 +156,4 @@ help(RelevanceSummarizer)

## Contact

If you have any questions or have encountered an error, feel free to contact me at `jai -dot- juneja -at- gmail -dot- com`.
If you have any questions or have encountered an error, feel free to contact me at `jai -dot- juneja -at- gmail -dot- com`.
16 changes: 8 additions & 8 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,26 +76,26 @@
relevance = RelevanceSummarizer()
textrank = TextRankSummarizer()

print '\n\nLSA Ozsoy:\n'
print('\n\nLSA Ozsoy:\n')
summary = lsa_o.summarize(txt, length=5)

for sentence in summary:
print sentence
print(sentence)

print '\n\nLSA Steinberger:\n'
print('\n\nLSA Steinberger:\n')
summary = lsa_s.summarize(txt, length=5)

for sentence in summary:
print sentence
print(sentence)

print '\n\nRelevance:\n'
print('\n\nRelevance:\n')
summary = relevance.summarize(txt, length=5)

for sentence in summary:
print sentence
print(sentence)

print '\n\nTextRank:\n'
print('\n\nTextRank:\n')
summary = textrank.summarize(txt, length=5)

for sentence in summary:
print sentence
print(sentence)
2 changes: 1 addition & 1 deletion ez_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
try:
from urllib.request import urlopen
except ImportError:
from urllib2 import urlopen
from urllib.request import urlopen

try:
from site import USER_SITE
Expand Down
54 changes: 54 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "PyTLDR"
version = "0.1.5"
description = "A module to perform automatic article summarization."
readme = "README.md"
requires-python = ">=3.12"
license = { file = "LICENSE.txt" }
authors = [
{ name = "Jai Juneja", email = "jai.juneja@gmail.com" },
]
keywords = [
"summarizer",
"summarization",
"natural language processing",
"nlp",
"machine learning",
"data mining",
"latent semantic analysis",
"lsa",
]
classifiers = [
"Development Status :: 3 - Alpha",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Information Analysis",
"Topic :: Text Processing :: Filters",
"Topic :: Text Processing :: Linguistic",
"Intended Audience :: Developers",
"License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
"Operating System :: OS Independent",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.12",
]
urls = { Homepage = "https://github.com/vinodnimbalkar/PyTLDR", Repository = "https://github.com/vinodnimbalkar/PyTLDR" }
dependencies = [
"numpy>=1.14.3",
"nltk>=3.4",
"scipy>=1.1.0",
"scikit-learn>=0.20.1",
"networkx>=2.2",
]

[tool.setuptools]
include-package-data = true

[tool.setuptools.packages.find]
include = ["pytldr*"]

[tool.setuptools.package-data]
"pytldr" = ["stopwords/*.txt"]
16 changes: 9 additions & 7 deletions pytldr/nlp/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,18 @@


def unicode_to_ascii(unicodestr):
if isinstance(unicodestr, bytes):
unicodestr = unicodestr.decode('utf-8', errors='ignore')
if isinstance(unicodestr, str):
return unicodestr
elif isinstance(unicodestr, unicode):
return unicodedata.normalize('NFKD', unicodestr).encode('ascii', 'ignore')
else:
raise ValueError('Input text must be of type str or unicode.')
normalized = unicodedata.normalize('NFKD', unicodestr)
return normalized.encode('ascii', 'ignore').decode('ascii')
raise ValueError('Input text must be of type str or bytes.')


def parse_input(text, extractor='newspaper'):
if isinstance(text, str) or isinstance(text, unicode):
if isinstance(text, bytes):
text = text.decode('utf-8', errors='ignore')
if isinstance(text, str):
if text.startswith(('http://', 'https://')):
# Input is a link - need to extract the text from html
if extractor.lower() == 'goose':
Expand All @@ -36,4 +38,4 @@ def parse_input(text, extractor='newspaper'):
# Input is a string containing the raw text
return unicode_to_ascii(text)
else:
raise ValueError('Input text must be of type str or unicode.')
raise ValueError('Input text must be of type str or bytes.')
10 changes: 5 additions & 5 deletions pytldr/nlp/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from nltk.stem import SnowballStemmer
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from string import punctuation
from preprocess import unicode_to_ascii
from .preprocess import unicode_to_ascii


class Tokenizer(object):
Expand All @@ -17,7 +17,7 @@ def __init__(self, language='english', stopwords=None, stemming=True):

if isinstance(stopwords, list):
self._stopwords = stopwords
elif isinstance(stopwords, (str, unicode)):
elif isinstance(stopwords, str):
# stopwords argument is a path
try:
self._stopwords = self._load_stopwords(stopwords)
Expand All @@ -42,7 +42,7 @@ def stemmer(self):
@staticmethod
def _load_stopwords(file_path):
try:
with open(file_path, 'rb') as stopwords_file:
with open(file_path, 'r') as stopwords_file:
stopwords = [word.strip('\n') for word in stopwords_file.readlines()]
except IOError:
stopwords = []
Expand Down Expand Up @@ -109,7 +109,7 @@ def _remove_whitespace(text):
if not non_spaces:
return text

first_non_space = non_spaces.next()
first_non_space = next(non_spaces)
first_non_space = first_non_space.start()

last_non_space = None
Expand Down Expand Up @@ -179,4 +179,4 @@ def tokenize_paragraphs(cls, text):

# Remove empty strings from list
paragraphs = [p for p in paragraphs if p]
return paragraphs
return paragraphs
4 changes: 1 addition & 3 deletions pytldr/summarize/baseclass.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,7 @@
from abc import ABCMeta, abstractmethod


class BaseSummarizer(object):
__metaclass__ = ABCMeta

class BaseSummarizer(object, metaclass=ABCMeta):
def __init__(self, tokenizer=Tokenizer('english')):
self._tokenizer = tokenizer

Expand Down
4 changes: 2 additions & 2 deletions pytldr/summarize/lsa.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
import numpy as np
from baseclass import BaseSummarizer
from .baseclass import BaseSummarizer
from scipy.sparse.linalg import svds
from warnings import warn

Expand Down Expand Up @@ -168,4 +168,4 @@ def summarize(self, text, topics=4, length=5, binary_matrix=True, topic_sigma_th


# Default LsaSummarizer just uses the Ozsoy method
LsaSummarizer = LsaOzsoy
LsaSummarizer = LsaOzsoy
14 changes: 10 additions & 4 deletions pytldr/summarize/textrank.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
# -*- coding: utf-8 -*-
from __future__ import division

import networkx
from .baseclass import BaseSummarizer


try:
from networkx import from_scipy_sparse_array as _from_scipy_sparse
except ImportError: # networkx<3.0
from networkx import from_scipy_sparse_matrix as _from_scipy_sparse


class TextRankSummarizer(BaseSummarizer):

def summarize(self, text, length=5, weighting='frequency', norm=None):
Expand Down Expand Up @@ -36,14 +42,14 @@ def summarize(self, text, length=5, weighting='frequency', norm=None):
# combinations of sentences.
similarity_matrix = (word_matrix * word_matrix.T)

similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
similarity_graph = _from_scipy_sparse(similarity_matrix)
scores = networkx.pagerank(similarity_graph)

ranked_sentences = sorted(
((score, ndx) for ndx, score in scores.items()), reverse=True
((score, ndx) for ndx, score in list(scores.items())), reverse=True
)

top_sentences = [ranked_sentences[i][1] for i in range(length)]
top_sentences.sort()

return [unprocessed_sentences[i] for i in top_sentences]
return [unprocessed_sentences[i] for i in top_sentences]
7 changes: 0 additions & 7 deletions requirements.txt

This file was deleted.

73 changes: 2 additions & 71 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,73 +1,4 @@
# -*- coding: utf-8 -*-
import ez_setup
ez_setup.use_setuptools(version='0.7')

from setuptools import setup
import os

PACKAGE_NAME = 'PyTLDR'
VERSION = '0.1.5'


def read(filename):
filepath = os.path.join(os.path.dirname(__file__), filename)
try:
# Convert GitHub markdown to restructured text (needed for upload to PyPI)
from pypandoc import convert
return convert(filepath, 'rst')
except ImportError:
return open(filepath).read()

description = 'A module to perform automatic article summarization.'
try:
long_description = read('README.md')
except IOError:
long_description = description

setup(
name=PACKAGE_NAME,
version=VERSION,
author='Jai Juneja',
author_email='jai.juneja@gmail.com',
description=description,
license='BSD',
keywords= [
'summarizer', 'summarization', 'natural language processing', 'nlp',
'machine learning', 'data mining', 'latent semantic analysis', 'lsa'
],
url='https://github.com/jaijuneja/PyTLDR',
packages=[
'pytldr',
'pytldr.nlp',
'pytldr.summarize'
],
long_description=long_description,
classifiers=[
'Development Status :: 3 - Alpha',
'Topic :: Scientific/Engineering :: Artificial Intelligence',
'Topic :: Scientific/Engineering :: Information Analysis',
'Topic :: Text Processing :: Filters',
'Topic :: Text Processing :: Linguistic',
'Intended Audience :: Developers',
'License :: OSI Approved :: BSD License',
'Operating System :: OS Independent',
'Programming Language :: Python'
],
install_requires=[
'numpy==1.8.0',
'nltk==2.0.5',
'scipy==0.13.2',
'scikit-learn==0.15.2',
'goose-extractor==1.0.25',
'newspaper==0.0.9.8',
'networkx==1.9.1'
],
include_package_data=True,
package_data={PACKAGE_NAME: ['stopwords/*.txt'],
'': ['README.md', 'ez_setup.py']},
tests_require=[
'nose',
'coverage',
],
test_suite='nose.collector'
)
if __name__ == "__main__":
setup()
Loading