From 69e014f9603bbad578933179209ad9befe428dc1 Mon Sep 17 00:00:00 2001 From: Ethan Date: Thu, 30 Apr 2020 15:12:39 +0800 Subject: [PATCH 1/2] black & Windows path supported & decode binary htmltext using utf8 --- htmlark.py | 234 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 155 insertions(+), 79 deletions(-) diff --git a/htmlark.py b/htmlark.py index 02c2918..405741e 100755 --- a/htmlark.py +++ b/htmlark.py @@ -2,6 +2,9 @@ """Embed images, CSS, and JavaScript into an HTML file, using data URIs.""" __version__ = "1.0.0" +import os +import string +import pathlib import argparse import base64 from datetime import datetime @@ -13,6 +16,7 @@ from urllib.parse import urlparse import bs4 + # Import requests if available, dummy it if not try: from requests import get as requests_get @@ -22,9 +26,11 @@ class RequestException(Exception): # NOQA make flake8 shut up """Dummy exception for when Requests is not installed.""" + pass -PARSERS = ['lxml', 'html5lib', 'html.parser'] + +PARSERS = ["lxml", "html5lib", "html.parser"] def get_available_parsers(): @@ -53,23 +59,24 @@ def _get_resource(resource_url: str) -> (str, bytes): ValueError: If ``resource_url``'s protocol is invalid. 
""" url_parsed = urlparse(resource_url) - if url_parsed.scheme in ['http', 'https']: + if url_parsed.scheme in ["http", "https"]: # Requests might not be installed if requests_get is not None: request = requests_get(resource_url) data = request.content - if 'Content-Type' in request.headers: - mimetype = request.headers['Content-Type'] + if "Content-Type" in request.headers: + mimetype = request.headers["Content-Type"] else: mimetype = mimetypes.guess_type(resource_url) else: raise NameError("HTTP URL found but requests not available") - elif url_parsed.scheme == '': + elif url_parsed.scheme in string.ascii_letters: # '' is local file - with open(resource_url, 'rb') as f: + # null-string("") is in any string + with open(resource_url, "rb") as f: data = f.read() mimetype, _ = mimetypes.guess_type(resource_url) - elif url_parsed.scheme == 'data': + elif url_parsed.scheme == "data": raise ValueError("Resource path is a data URI", url_parsed.scheme) else: raise ValueError("Not local path or HTTP/HTTPS URL", url_parsed.scheme) @@ -88,20 +95,25 @@ def make_data_uri(mimetype: str, data: bytes) -> str: Returns: str: Input data encoded into a data URI. 
""" - mimetype = '' if mimetype is None else mimetype - if mimetype in ['', 'text/css', 'application/javascript']: + mimetype = "" if mimetype is None else mimetype + if mimetype in ["", "text/css", "application/javascript"]: # Text data can simply be URL-encoded encoded_data = quote(data.decode()) else: - mimetype = mimetype + ';base64' + mimetype = mimetype + ";base64" encoded_data = base64.b64encode(data).decode() return "data:{},{}".format(mimetype, encoded_data) -def convert_page(page_path: str, parser: str='auto', - callback: Callable[[str, str, str], None]=lambda *_: None, - ignore_errors: bool=False, ignore_images: bool=False, - ignore_css: bool=False, ignore_js: bool=False) -> str: +def convert_page( + page_path: str, + parser: str = "auto", + callback: Callable[[str, str, str], None] = lambda *_: None, + ignore_errors: bool = False, + ignore_images: bool = False, + ignore_css: bool = False, + ignore_js: bool = False, +) -> str: """Take an HTML file or URL and outputs new HTML with resources as data URIs. 
Parameters: @@ -165,7 +177,7 @@ def convert_page(page_path: str, parser: str='auto', """ # Check features if requests_get is None: - callback('INFO', 'feature', "Requests not available, web downloading disabled") + callback("INFO", "feature", "Requests not available, web downloading disabled") # Get page HTML, whether from a server, a local file, or stdin if page_path is None: @@ -176,109 +188,165 @@ def convert_page(page_path: str, parser: str='auto', # Not all parsers are equal - it can be specified on the command line # so the user can try another when one fails - if parser == 'auto': + if parser == "auto": parser = get_available_parsers()[0] - soup = bs4.BeautifulSoup(page_text, parser) - callback('INFO', 'parser', "Using parser " + parser) + soup = bs4.BeautifulSoup(page_text.decode("utf-8"), parser) + callback("INFO", "parser", "Using parser " + parser) tags = [] # Gather all the relevant tags together if not ignore_images: - tags += soup('img') + tags += soup("img") if not ignore_css: - csstags = soup('link') + csstags = soup("link") for css in csstags: - if 'stylesheet' in css['rel']: + if "stylesheet" in css["rel"]: tags.append(css) if not ignore_js: - scripttags = soup('script') + scripttags = soup("script") for script in scripttags: - if 'src' in script.attrs: + if "src" in script.attrs: tags.append(script) # Convert the linked resources for tag in tags: - tag_url = tag['href'] if tag.name == 'link' else tag['src'] + tag_url = tag["href"] if tag.name == "link" else tag["src"] try: - # BUG: doesn't work if using relative remote URLs in a local file - fullpath = urljoin(page_path, tag_url) + if not bool(urlparse(page_path).netloc): + dir_path = pathlib.Path(page_path).parent.absolute() + fullpath = os.path.join(dir_path, tag_url) + else: + fullpath = urljoin(page_path, tag_url) tag_mime, tag_data = _get_resource(fullpath) except RequestException: - callback('ERROR', tag.name, "Can't access URL " + fullpath) + callback("ERROR", tag.name, "Can't access URL " 
+ fullpath) if not ignore_errors: raise except OSError as e: - callback('ERROR', tag.name, "Error reading '{}': {}".format(e.filename, e.strerror)) + callback( + "ERROR", + tag.name, + "Error reading '{}': {}".format(e.filename, e.strerror), + ) if not ignore_errors: raise except ValueError as e: # Raised when a problem with the URL is found scheme = e.args[1] # Don't need to process things that are already data URIs - if scheme == 'data': - callback('INFO', tag.name, "Already data URI") + if scheme == "data": + callback("INFO", tag.name, "Already data URI") else: # htmlark can only get from http/https and local files - callback('ERROR', tag.name, "Unknown protocol in URL: " + tag_url) + callback("ERROR", tag.name, "Unknown protocol in URL: " + tag_url) if not ignore_errors: raise except NameError as e: # Requests module is not available - callback('ERROR', tag.name, str(e)) + callback("ERROR", tag.name, str(e)) if not ignore_errors: raise else: encoded_resource = make_data_uri(tag_mime, tag_data) - if tag.name == 'link': - tag['href'] = encoded_resource + if tag.name == "link": + tag["href"] = encoded_resource else: - tag['src'] = encoded_resource - callback('INFO', tag.name, tag_url) + tag["src"] = encoded_resource + callback("INFO", tag.name, tag_url) # Record the original URL so the original HTML can be recovered tag.insert_after(bs4.Comment("URL:" + tag_url)) - soup.html.insert_after(bs4.Comment( - "Generated by HTMLArk {}. Original URL {}".format(datetime.now(), - page_path))) + soup.html.insert_after( + bs4.Comment( + "Generated by HTMLArk {}. Original URL {}".format(datetime.now(), page_path) + ) + ) return str(soup) def _get_options(): """Parse command line options.""" - parser = argparse.ArgumentParser(description=""" + parser = argparse.ArgumentParser( + description=""" Converts a webpage including external resources into a single HTML file. 
Note that resources loaded with JavaScript will not be handled - by this program, it will only work properly with static pages.""") + by this program, it will only work properly with static pages.""" + ) # Can't make this an argparse.FileType, because it could be a local path # or an URL, and convert_page needs the path - parser.add_argument('webpage', nargs='?', default=None, - help="""URL or path of webpage to convert. If not - specified, read from STDIN.""") - parser.add_argument('-o', '--output', default=sys.stdout, - type=argparse.FileType('w', encoding='UTF-8'), - help="File to write output. Defaults to STDOUT.") - parser.add_argument('-E', '--ignore-errors', action='store_true', default=False, - help="Ignores unreadable resources") - parser.add_argument('-I', '--ignore-images', action='store_true', default=False, - help="Ignores images during conversion") - parser.add_argument('-C', '--ignore-css', action='store_true', default=False, - help="Ignores stylesheets during conversion") - parser.add_argument('-J', '--ignore-js', action='store_true', default=False, - help="Ignores external JavaScript during conversion") - parser.add_argument('-p', '--parser', default='auto', - choices=['html.parser', 'lxml', 'html5lib', 'auto'], - help="""Select HTML parser. Defaults to auto, which + parser.add_argument( + "webpage", + nargs="?", + default=None, + help="""URL or path of webpage to convert. If not + specified, read from STDIN.""", + ) + parser.add_argument( + "-o", + "--output", + default=sys.stdout, + type=argparse.FileType("w", encoding="UTF-8"), + help="File to write output. 
Defaults to STDOUT.", + ) + parser.add_argument( + "-E", + "--ignore-errors", + action="store_true", + default=False, + help="Ignores unreadable resources", + ) + parser.add_argument( + "-I", + "--ignore-images", + action="store_true", + default=False, + help="Ignores images during conversion", + ) + parser.add_argument( + "-C", + "--ignore-css", + action="store_true", + default=False, + help="Ignores stylesheets during conversion", + ) + parser.add_argument( + "-J", + "--ignore-js", + action="store_true", + default=False, + help="Ignores external JavaScript during conversion", + ) + parser.add_argument( + "-p", + "--parser", + default="auto", + choices=["html.parser", "lxml", "html5lib", "auto"], + help="""Select HTML parser. Defaults to auto, which tries to use lxml, html5lib, and html.parser in that order. See documentation for more - information.""") - parser.add_argument('--list-parsers', action='store_true', default=False, - help="Lists installed parsers available to HTMLArk") - parser.add_argument('-v', '--verbose', action='store_true', default=False, - help="Prints information during conversion") - parser.add_argument('-V', '--version', action='version', - version="HTMLArk v{}".format(__version__), - help="Displays version information") + information.""", + ) + parser.add_argument( + "--list-parsers", + action="store_true", + default=False, + help="Lists installed parsers available to HTMLArk", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + default=False, + help="Prints information during conversion", + ) + parser.add_argument( + "-V", + "--version", + action="version", + version="HTMLArk v{}".format(__version__), + help="Displays version information", + ) parsed = parser.parse_args() if parsed.list_parsers: @@ -296,31 +364,37 @@ def _main(): # All further messages should use print_verbose() or print_error() def print_error(m): print(m, file=sys.stderr) + # print_error = lambda m: print(m, file=sys.stderr) if options.verbose: 
print_verbose = print_error else: + def print_verbose(_): pass def info_callback(severity, message_type, message_data): """Display progress information during conversion.""" - if message_type == 'img': + if message_type == "img": tagtype = "Image" - elif message_type == 'link': + elif message_type == "link": tagtype = "CSS" - elif message_type == 'script': + elif message_type == "script": tagtype = "JS" else: tagtype = message_type # Only display info messages if -v/--verbose flag is set - if severity == 'INFO': + if severity == "INFO": if options.verbose: print_verbose("{}: {}".format(tagtype, message_data)) - elif severity == 'ERROR': + elif severity == "ERROR": print_error("{}: {}".format(tagtype, message_data)) else: - print_error("Unknown message level {}, please tell the author of the program".format(severity)) + print_error( + "Unknown message level {}, please tell the author of the program".format( + severity + ) + ) print_error("{}: {}".format(tagtype, message_data)) # Convert page @@ -330,13 +404,15 @@ def info_callback(severity, message_type, message_data): print_verbose("Processing {}".format(options.webpage)) try: - newhtml = convert_page(options.webpage, - parser=options.parser, - ignore_errors=options.ignore_errors, - ignore_images=options.ignore_images, - ignore_css=options.ignore_css, - ignore_js=options.ignore_js, - callback=info_callback) + newhtml = convert_page( + options.webpage, + parser=options.parser, + ignore_errors=options.ignore_errors, + ignore_images=options.ignore_images, + ignore_css=options.ignore_css, + ignore_js=options.ignore_js, + callback=info_callback, + ) except (OSError, RequestException, ValueError) as e: sys.exit("Unable to convert webpage: {}".format(e)) except NameError: From b46c25b1f4335b8c519d132fd1a5640d4f16233e Mon Sep 17 00:00:00 2001 From: Ethan Date: Thu, 7 May 2020 10:04:59 +0800 Subject: [PATCH 2/2] charset as arg --- htmlark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/htmlark.py 
b/htmlark.py index 405741e..e1d09ff 100755 --- a/htmlark.py +++ b/htmlark.py @@ -113,6 +113,7 @@ def convert_page( ignore_images: bool = False, ignore_css: bool = False, ignore_js: bool = False, + charset: str = "utf-8", ) -> str: """Take an HTML file or URL and outputs new HTML with resources as data URIs. @@ -190,7 +191,7 @@ def convert_page( # so the user can try another when one fails if parser == "auto": parser = get_available_parsers()[0] - soup = bs4.BeautifulSoup(page_text.decode("utf-8"), parser) + soup = bs4.BeautifulSoup(page_text.decode(charset), parser) callback("INFO", "parser", "Using parser " + parser) tags = []