diff --git a/htmlark.py b/htmlark.py
index 02c2918..e1d09ff 100755
--- a/htmlark.py
+++ b/htmlark.py
@@ -2,6 +2,9 @@
"""Embed images, CSS, and JavaScript into an HTML file, using data URIs."""
__version__ = "1.0.0"
+import os
+import string
+import pathlib
import argparse
import base64
from datetime import datetime
@@ -13,6 +16,7 @@
from urllib.parse import urlparse
import bs4
+
# Import requests if available, dummy it if not
try:
from requests import get as requests_get
@@ -22,9 +26,11 @@
class RequestException(Exception): # NOQA make flake8 shut up
"""Dummy exception for when Requests is not installed."""
+
pass
-PARSERS = ['lxml', 'html5lib', 'html.parser']
+
+PARSERS = ["lxml", "html5lib", "html.parser"]
def get_available_parsers():
@@ -53,23 +59,24 @@ def _get_resource(resource_url: str) -> (str, bytes):
ValueError: If ``resource_url``'s protocol is invalid.
"""
url_parsed = urlparse(resource_url)
- if url_parsed.scheme in ['http', 'https']:
+ if url_parsed.scheme in ["http", "https"]:
# Requests might not be installed
if requests_get is not None:
request = requests_get(resource_url)
data = request.content
- if 'Content-Type' in request.headers:
- mimetype = request.headers['Content-Type']
+ if "Content-Type" in request.headers:
+ mimetype = request.headers["Content-Type"]
else:
mimetype = mimetypes.guess_type(resource_url)
else:
raise NameError("HTTP URL found but requests not available")
- elif url_parsed.scheme == '':
+ elif url_parsed.scheme in string.ascii_letters:
# '' is local file
- with open(resource_url, 'rb') as f:
+        # Matches a bare local path (scheme "") too, since "" is a substring of every string,
+ with open(resource_url, "rb") as f:
data = f.read()
mimetype, _ = mimetypes.guess_type(resource_url)
- elif url_parsed.scheme == 'data':
+ elif url_parsed.scheme == "data":
raise ValueError("Resource path is a data URI", url_parsed.scheme)
else:
raise ValueError("Not local path or HTTP/HTTPS URL", url_parsed.scheme)
@@ -88,20 +95,26 @@ def make_data_uri(mimetype: str, data: bytes) -> str:
Returns:
str: Input data encoded into a data URI.
"""
- mimetype = '' if mimetype is None else mimetype
- if mimetype in ['', 'text/css', 'application/javascript']:
+ mimetype = "" if mimetype is None else mimetype
+ if mimetype in ["", "text/css", "application/javascript"]:
# Text data can simply be URL-encoded
encoded_data = quote(data.decode())
else:
- mimetype = mimetype + ';base64'
+ mimetype = mimetype + ";base64"
encoded_data = base64.b64encode(data).decode()
return "data:{},{}".format(mimetype, encoded_data)
-def convert_page(page_path: str, parser: str='auto',
- callback: Callable[[str, str, str], None]=lambda *_: None,
- ignore_errors: bool=False, ignore_images: bool=False,
- ignore_css: bool=False, ignore_js: bool=False) -> str:
+def convert_page(
+ page_path: str,
+ parser: str = "auto",
+ callback: Callable[[str, str, str], None] = lambda *_: None,
+ ignore_errors: bool = False,
+ ignore_images: bool = False,
+ ignore_css: bool = False,
+ ignore_js: bool = False,
+ charset: str = "utf-8",
+) -> str:
"""Take an HTML file or URL and outputs new HTML with resources as data URIs.
Parameters:
@@ -165,7 +178,7 @@ def convert_page(page_path: str, parser: str='auto',
"""
# Check features
if requests_get is None:
- callback('INFO', 'feature', "Requests not available, web downloading disabled")
+ callback("INFO", "feature", "Requests not available, web downloading disabled")
# Get page HTML, whether from a server, a local file, or stdin
if page_path is None:
@@ -176,109 +189,165 @@ def convert_page(page_path: str, parser: str='auto',
# Not all parsers are equal - it can be specified on the command line
# so the user can try another when one fails
- if parser == 'auto':
+ if parser == "auto":
parser = get_available_parsers()[0]
- soup = bs4.BeautifulSoup(page_text, parser)
- callback('INFO', 'parser', "Using parser " + parser)
+ soup = bs4.BeautifulSoup(page_text.decode(charset), parser)
+ callback("INFO", "parser", "Using parser " + parser)
tags = []
# Gather all the relevant tags together
if not ignore_images:
- tags += soup('img')
+ tags += soup("img")
if not ignore_css:
- csstags = soup('link')
+ csstags = soup("link")
for css in csstags:
- if 'stylesheet' in css['rel']:
+ if "stylesheet" in css["rel"]:
tags.append(css)
if not ignore_js:
- scripttags = soup('script')
+ scripttags = soup("script")
for script in scripttags:
- if 'src' in script.attrs:
+ if "src" in script.attrs:
tags.append(script)
# Convert the linked resources
for tag in tags:
- tag_url = tag['href'] if tag.name == 'link' else tag['src']
+ tag_url = tag["href"] if tag.name == "link" else tag["src"]
try:
- # BUG: doesn't work if using relative remote URLs in a local file
- fullpath = urljoin(page_path, tag_url)
+ if not bool(urlparse(page_path).netloc):
+ dir_path = pathlib.Path(page_path).parent.absolute()
+ fullpath = os.path.join(dir_path, tag_url)
+ else:
+ fullpath = urljoin(page_path, tag_url)
tag_mime, tag_data = _get_resource(fullpath)
except RequestException:
- callback('ERROR', tag.name, "Can't access URL " + fullpath)
+ callback("ERROR", tag.name, "Can't access URL " + fullpath)
if not ignore_errors:
raise
except OSError as e:
- callback('ERROR', tag.name, "Error reading '{}': {}".format(e.filename, e.strerror))
+ callback(
+ "ERROR",
+ tag.name,
+ "Error reading '{}': {}".format(e.filename, e.strerror),
+ )
if not ignore_errors:
raise
except ValueError as e:
# Raised when a problem with the URL is found
scheme = e.args[1]
# Don't need to process things that are already data URIs
- if scheme == 'data':
- callback('INFO', tag.name, "Already data URI")
+ if scheme == "data":
+ callback("INFO", tag.name, "Already data URI")
else:
# htmlark can only get from http/https and local files
- callback('ERROR', tag.name, "Unknown protocol in URL: " + tag_url)
+ callback("ERROR", tag.name, "Unknown protocol in URL: " + tag_url)
if not ignore_errors:
raise
except NameError as e:
# Requests module is not available
- callback('ERROR', tag.name, str(e))
+ callback("ERROR", tag.name, str(e))
if not ignore_errors:
raise
else:
encoded_resource = make_data_uri(tag_mime, tag_data)
- if tag.name == 'link':
- tag['href'] = encoded_resource
+ if tag.name == "link":
+ tag["href"] = encoded_resource
else:
- tag['src'] = encoded_resource
- callback('INFO', tag.name, tag_url)
+ tag["src"] = encoded_resource
+ callback("INFO", tag.name, tag_url)
# Record the original URL so the original HTML can be recovered
tag.insert_after(bs4.Comment("URL:" + tag_url))
- soup.html.insert_after(bs4.Comment(
- "Generated by HTMLArk {}. Original URL {}".format(datetime.now(),
- page_path)))
+ soup.html.insert_after(
+ bs4.Comment(
+ "Generated by HTMLArk {}. Original URL {}".format(datetime.now(), page_path)
+ )
+ )
return str(soup)
def _get_options():
"""Parse command line options."""
- parser = argparse.ArgumentParser(description="""
+ parser = argparse.ArgumentParser(
+ description="""
Converts a webpage including external resources into a single HTML
file. Note that resources loaded with JavaScript will not be handled
- by this program, it will only work properly with static pages.""")
+ by this program, it will only work properly with static pages."""
+ )
# Can't make this an argparse.FileType, because it could be a local path
# or an URL, and convert_page needs the path
- parser.add_argument('webpage', nargs='?', default=None,
- help="""URL or path of webpage to convert. If not
- specified, read from STDIN.""")
- parser.add_argument('-o', '--output', default=sys.stdout,
- type=argparse.FileType('w', encoding='UTF-8'),
- help="File to write output. Defaults to STDOUT.")
- parser.add_argument('-E', '--ignore-errors', action='store_true', default=False,
- help="Ignores unreadable resources")
- parser.add_argument('-I', '--ignore-images', action='store_true', default=False,
- help="Ignores images during conversion")
- parser.add_argument('-C', '--ignore-css', action='store_true', default=False,
- help="Ignores stylesheets during conversion")
- parser.add_argument('-J', '--ignore-js', action='store_true', default=False,
- help="Ignores external JavaScript during conversion")
- parser.add_argument('-p', '--parser', default='auto',
- choices=['html.parser', 'lxml', 'html5lib', 'auto'],
- help="""Select HTML parser. Defaults to auto, which
+ parser.add_argument(
+ "webpage",
+ nargs="?",
+ default=None,
+ help="""URL or path of webpage to convert. If not
+ specified, read from STDIN.""",
+ )
+ parser.add_argument(
+ "-o",
+ "--output",
+ default=sys.stdout,
+ type=argparse.FileType("w", encoding="UTF-8"),
+ help="File to write output. Defaults to STDOUT.",
+ )
+ parser.add_argument(
+ "-E",
+ "--ignore-errors",
+ action="store_true",
+ default=False,
+ help="Ignores unreadable resources",
+ )
+ parser.add_argument(
+ "-I",
+ "--ignore-images",
+ action="store_true",
+ default=False,
+ help="Ignores images during conversion",
+ )
+ parser.add_argument(
+ "-C",
+ "--ignore-css",
+ action="store_true",
+ default=False,
+ help="Ignores stylesheets during conversion",
+ )
+ parser.add_argument(
+ "-J",
+ "--ignore-js",
+ action="store_true",
+ default=False,
+ help="Ignores external JavaScript during conversion",
+ )
+ parser.add_argument(
+ "-p",
+ "--parser",
+ default="auto",
+ choices=["html.parser", "lxml", "html5lib", "auto"],
+ help="""Select HTML parser. Defaults to auto, which
tries to use lxml, html5lib, and html.parser
in that order. See documentation for more
- information.""")
- parser.add_argument('--list-parsers', action='store_true', default=False,
- help="Lists installed parsers available to HTMLArk")
- parser.add_argument('-v', '--verbose', action='store_true', default=False,
- help="Prints information during conversion")
- parser.add_argument('-V', '--version', action='version',
- version="HTMLArk v{}".format(__version__),
- help="Displays version information")
+ information.""",
+ )
+ parser.add_argument(
+ "--list-parsers",
+ action="store_true",
+ default=False,
+ help="Lists installed parsers available to HTMLArk",
+ )
+ parser.add_argument(
+ "-v",
+ "--verbose",
+ action="store_true",
+ default=False,
+ help="Prints information during conversion",
+ )
+ parser.add_argument(
+ "-V",
+ "--version",
+ action="version",
+ version="HTMLArk v{}".format(__version__),
+ help="Displays version information",
+ )
parsed = parser.parse_args()
if parsed.list_parsers:
@@ -296,31 +365,37 @@ def _main():
# All further messages should use print_verbose() or print_error()
def print_error(m):
print(m, file=sys.stderr)
+
# print_error = lambda m: print(m, file=sys.stderr)
if options.verbose:
print_verbose = print_error
else:
+
def print_verbose(_):
pass
def info_callback(severity, message_type, message_data):
"""Display progress information during conversion."""
- if message_type == 'img':
+ if message_type == "img":
tagtype = "Image"
- elif message_type == 'link':
+ elif message_type == "link":
tagtype = "CSS"
- elif message_type == 'script':
+ elif message_type == "script":
tagtype = "JS"
else:
tagtype = message_type
# Only display info messages if -v/--verbose flag is set
- if severity == 'INFO':
+ if severity == "INFO":
if options.verbose:
print_verbose("{}: {}".format(tagtype, message_data))
- elif severity == 'ERROR':
+ elif severity == "ERROR":
print_error("{}: {}".format(tagtype, message_data))
else:
- print_error("Unknown message level {}, please tell the author of the program".format(severity))
+ print_error(
+ "Unknown message level {}, please tell the author of the program".format(
+ severity
+ )
+ )
print_error("{}: {}".format(tagtype, message_data))
# Convert page
@@ -330,13 +405,15 @@ def info_callback(severity, message_type, message_data):
print_verbose("Processing {}".format(options.webpage))
try:
- newhtml = convert_page(options.webpage,
- parser=options.parser,
- ignore_errors=options.ignore_errors,
- ignore_images=options.ignore_images,
- ignore_css=options.ignore_css,
- ignore_js=options.ignore_js,
- callback=info_callback)
+ newhtml = convert_page(
+ options.webpage,
+ parser=options.parser,
+ ignore_errors=options.ignore_errors,
+ ignore_images=options.ignore_images,
+ ignore_css=options.ignore_css,
+ ignore_js=options.ignore_js,
+ callback=info_callback,
+ )
except (OSError, RequestException, ValueError) as e:
sys.exit("Unable to convert webpage: {}".format(e))
except NameError: