From 5bfd7d8695a546c907548fd40512ef6544785e5d Mon Sep 17 00:00:00 2001 From: TheAMM Date: Thu, 31 Oct 2019 21:57:27 +0200 Subject: [PATCH 1/2] Introduce baked queries SQA's baked queries prepare the queries in advance, caching yada yada. Makes things a bit faster. Also bigger speedup included is a shoddy cache for the total torrent count (only applied to baked queries currently). Caching the value for a few dozen seconds shaves off some wasted time, as it's mostly just used for pagination. --- config.example.py | 7 + nyaa/search.py | 307 ++++++++++++++++++++++++++++++++++++++++++++ nyaa/views/main.py | 8 +- nyaa/views/users.py | 7 +- 4 files changed, 325 insertions(+), 4 deletions(-) diff --git a/config.example.py b/config.example.py index 304e22906..d28377694 100644 --- a/config.example.py +++ b/config.example.py @@ -156,6 +156,13 @@ # How many pages we'll return at most MAX_PAGES = 100 +# How long and how many entries to cache for count queries +COUNT_CACHE_SIZE = 256 +COUNT_CACHE_DURATION = 30 + +# Use baked queries for database search +USE_BAKED_SEARCH = False + # Use better searching with ElasticSearch # See README.MD on setup! 
USE_ELASTIC_SEARCH = False diff --git a/nyaa/search.py b/nyaa/search.py index a1700c7f1..f6932afe1 100644 --- a/nyaa/search.py +++ b/nyaa/search.py @@ -1,13 +1,17 @@ import math import re import shlex +import threading +import time import flask +from flask_sqlalchemy import Pagination import sqlalchemy import sqlalchemy_fulltext.modes as FullTextMode from elasticsearch import Elasticsearch from elasticsearch_dsl import Q, Search +from sqlalchemy.ext import baked from sqlalchemy_fulltext import FullTextSearch from nyaa import models @@ -531,3 +535,306 @@ def search_db(term='', user=None, sort='id', order='desc', category='0_0', max_page=MAX_PAGES) return query + + +# Baked queries follow + +class BakedPair(object): + def __init__(self, *items): + self.items = list(items) + + def __iadd__(self, other): + for item in self.items: + item += other + + return self + + +bakery = baked.bakery() + + +BAKED_SORT_KEYS = { + 'id': models.Torrent.id, + 'size': models.Torrent.filesize, + 'comments': models.Torrent.comment_count, + 'seeders': models.Statistic.seed_count, + 'leechers': models.Statistic.leech_count, + 'downloads': models.Statistic.download_count +} + +BAKED_SORT_LAMBDAS = { + 'id-asc': lambda q: q.order_by(models.Torrent.id.asc()), + 'id-desc': lambda q: q.order_by(models.Torrent.id.desc()), + + 'size-asc': lambda q: q.order_by(models.Torrent.filesize.asc()), + 'size-desc': lambda q: q.order_by(models.Torrent.filesize.desc()), + + 'comments-asc': lambda q: q.order_by(models.Torrent.comment_count.asc()), + 'comments-desc': lambda q: q.order_by(models.Torrent.comment_count.desc()), + + # This is a bit stupid, but programmatically generating these mixed up the baked keys, so deal. 
+ 'seeders-asc': lambda q: q.join(models.Statistic).with_hint( + models.Statistic, 'USE INDEX (idx_nyaa_statistics_seed_count)' + ).order_by(models.Statistic.seed_count.asc(), models.Torrent.id.asc()), + 'seeders-desc': lambda q: q.join(models.Statistic).with_hint( + models.Statistic, 'USE INDEX (idx_nyaa_statistics_seed_count)' + ).order_by(models.Statistic.seed_count.desc(), models.Torrent.id.desc()), + + 'leechers-asc': lambda q: q.join(models.Statistic).with_hint( + models.Statistic, 'USE INDEX (idx_nyaa_statistics_leech_count)' + ).order_by(models.Statistic.leech_count.asc(), models.Torrent.id.asc()), + 'leechers-desc': lambda q: q.join(models.Statistic).with_hint( + models.Statistic, 'USE INDEX (idx_nyaa_statistics_leech_count)' + ).order_by(models.Statistic.leech_count.desc(), models.Torrent.id.desc()), + + 'downloads-asc': lambda q: q.join(models.Statistic).with_hint( + models.Statistic, 'USE INDEX (idx_nyaa_statistics_download_count)' + ).order_by(models.Statistic.download_count.asc(), models.Torrent.id.asc()), + 'downloads-desc': lambda q: q.join(models.Statistic).with_hint( + models.Statistic, 'USE INDEX (idx_nyaa_statistics_download_count)' + ).order_by(models.Statistic.download_count.desc(), models.Torrent.id.desc()), +} + + +BAKED_FILTER_LAMBDAS = { + '0': None, + '1': lambda q: ( + q.filter(models.Torrent.flags.op('&')(models.TorrentFlags.REMAKE.value).is_(False)) + ), + '2': lambda q: ( + q.filter(models.Torrent.flags.op('&')(models.TorrentFlags.TRUSTED.value).is_(True)) + ), + '3': lambda q: ( + q.filter(models.Torrent.flags.op('&')(models.TorrentFlags.COMPLETE.value).is_(True)) + ), +} + + +def search_db_baked(term='', user=None, sort='id', order='desc', category='0_0', + quality_filter='0', page=1, rss=False, admin=False, + logged_in_user=None, per_page=75): + if page > 4294967295: + flask.abort(404) + + MAX_PAGES = app.config.get("MAX_PAGES", 0) + + if MAX_PAGES and page > MAX_PAGES: + flask.abort(flask.Response("You've exceeded the maximum 
number of pages. Please " + "make your search query less broad.", 403)) + + sort_lambda = BAKED_SORT_LAMBDAS.get('{}-{}'.format(sort, order).lower()) + if not sort_lambda: + flask.abort(400) + + sentinel = object() + filter_lambda = BAKED_FILTER_LAMBDAS.get(quality_filter.lower(), sentinel) + if filter_lambda is sentinel: + flask.abort(400) + + if user: + user = models.User.by_id(user) + if not user: + flask.abort(404) + user = user.id + + main_cat_id = 0 + sub_cat_id = 0 + + if category: + cat_match = re.match(r'^(\d+)_(\d+)$', category) + if not cat_match: + flask.abort(400) + + main_cat_id = int(cat_match.group(1)) + sub_cat_id = int(cat_match.group(2)) + + if main_cat_id > 0: + if sub_cat_id > 0: + sub_category = models.SubCategory.by_category_ids(main_cat_id, sub_cat_id) + if not sub_category: + flask.abort(400) + else: + main_category = models.MainCategory.by_id(main_cat_id) + if not main_category: + flask.abort(400) + + # Force sort by id desc if rss + if rss: + sort_lambda = BAKED_SORT_LAMBDAS['id-desc'] + + same_user = False + if logged_in_user: + same_user = logged_in_user.id == user + + if term: + query = bakery(lambda session: session.query(models.TorrentNameSearch)) + count_query = bakery(lambda session: session.query( + sqlalchemy.func.count(models.TorrentNameSearch.id))) + else: + query = bakery(lambda session: session.query(models.Torrent)) + # This is... eh. Optimize the COUNT() query since MySQL is bad at that. 
+ # See http://docs.sqlalchemy.org/en/rel_1_1/orm/query.html#sqlalchemy.orm.query.Query.count + # Wrap the queries into the helper class to deduplicate code and + # apply filters to both in one go + count_query = bakery(lambda session: session.query( + sqlalchemy.func.count(models.Torrent.id))) + + qpc = BakedPair(query, count_query) + bp = sqlalchemy.bindparam + + baked_params = {} + + # User view (/user/username) + if user: + qpc += lambda q: q.filter(models.Torrent.uploader_id == bp('user')) + baked_params['user'] = user + + if not admin: + # Hide all DELETED torrents if regular user + qpc += lambda q: q.filter(models.Torrent.flags.op('&') + (int(models.TorrentFlags.DELETED)).is_(False)) + # If logged in user is not the same as the user being viewed, + # show only torrents that aren't hidden or anonymous + # + # If logged in user is the same as the user being viewed, + # show all torrents including hidden and anonymous ones + # + # On RSS pages in user view, + # show only torrents that aren't hidden or anonymous no matter what + if not same_user or rss: + qpc += lambda q: ( + q.filter( + models.Torrent.flags.op('&')( + int(models.TorrentFlags.HIDDEN | models.TorrentFlags.ANONYMOUS) + ).is_(False) + ) + ) + # General view (homepage, general search view) + else: + if not admin: + # Hide all DELETED torrents if regular user + qpc += lambda q: q.filter(models.Torrent.flags.op('&') + (int(models.TorrentFlags.DELETED)).is_(False)) + # If logged in, show all torrents that aren't hidden unless they belong to you + # On RSS pages, show all public torrents and nothing more. 
+ if logged_in_user and not rss: + qpc += lambda q: q.filter( + (models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN)).is_(False)) | + (models.Torrent.uploader_id == bp('logged_in_user')) + ) + baked_params['logged_in_user'] = logged_in_user + # Otherwise, show all torrents that aren't hidden + else: + qpc += lambda q: q.filter(models.Torrent.flags.op('&') + (int(models.TorrentFlags.HIDDEN)).is_(False)) + + if sub_cat_id: + qpc += lambda q: q.filter( + (models.Torrent.main_category_id == bp('main_cat_id')), + (models.Torrent.sub_category_id == bp('sub_cat_id')) + ) + baked_params['main_cat_id'] = main_cat_id + baked_params['sub_cat_id'] = sub_cat_id + elif main_cat_id: + qpc += lambda q: q.filter(models.Torrent.main_category_id == bp('main_cat_id')) + baked_params['main_cat_id'] = main_cat_id + + if filter_lambda: + qpc += filter_lambda + + if term: + raise Exception('Baked search does not support search terms') + + # Sort and order + query += sort_lambda + + if rss: + query += lambda q: q.limit(bp('per_page')) + baked_params['per_page'] = per_page + + return query(db.session()).params(**baked_params).all() + + return baked_paginate(query, count_query, baked_params, + page, per_page=per_page, step=5, max_page=MAX_PAGES) + + +class ShoddyLRU(object): + def __init__(self, max_entries=128, expiry=60): + self.max_entries = max_entries + self.expiry = expiry + + # Contains [value, last_used, expires_at] + self.entries = {} + self._lock = threading.Lock() + + self._sentinel = object() + + def get(self, key, default=None): + entry = self.entries.get(key) + if entry is None: + return default + + now = time.time() + if now > entry[2]: + with self._lock: + del self.entries[key] + return default + + entry[1] = now + return entry[0] + + def put(self, key, value, expiry=None): + with self._lock: + overflow = len(self.entries) - self.max_entries + if overflow > 0: + # Pick the least recently used keys + removed_keys = [key for key, value in sorted( + 
self.entries.items(), key=lambda t:t[1][1])][:overflow] + for key in removed_keys: + del self.entries[key] + + now = time.time() + self.entries[key] = [value, now, now + (expiry or self.expiry)] + + +LRU_CACHE = ShoddyLRU(256, 60) + + +def baked_paginate(query, count_query, params, page=1, per_page=50, max_page=None, step=5): + if page < 1: + flask.abort(404) + + if max_page and page > max_page: + flask.abort(404) + bp = sqlalchemy.bindparam + + ses = db.session() + + # Count all items, use cache + if app.config['COUNT_CACHE_DURATION']: + query_key = (count_query._effective_key(ses), tuple(sorted(params.items()))) + total_query_count = LRU_CACHE.get(query_key) + if total_query_count is None: + total_query_count = count_query(ses).params(**params).scalar() + LRU_CACHE.put(query_key, total_query_count, expiry=app.config['COUNT_CACHE_DURATION']) + else: + total_query_count = count_query(ses).params(**params).scalar() + + # Grab items on current page + query += lambda q: q.limit(bp('limit')).offset(bp('offset')) + params['limit'] = per_page + params['offset'] = (page - 1) * per_page + + res = query(ses).params(**params) + items = res.all() + + if max_page: + total_query_count = min(total_query_count, max_page * per_page) + + # Handle case where we've had no results but then have some while in cache + total_query_count = max(total_query_count, len(items)) + + if not items and page != 1: + flask.abort(404) + + return Pagination(None, page, per_page, total_query_count, items) diff --git a/nyaa/views/main.py b/nyaa/views/main.py index 8dfe38f88..d6f9a7292 100644 --- a/nyaa/views/main.py +++ b/nyaa/views/main.py @@ -10,7 +10,7 @@ from nyaa import models from nyaa.extensions import db from nyaa.search import (DEFAULT_MAX_SEARCH_RESULT, DEFAULT_PER_PAGE, SERACH_PAGINATE_DISPLAY_MSG, - _generate_query_string, search_db, search_elastic) + _generate_query_string, search_db, search_db_baked, search_elastic) from nyaa.utils import chain_get from nyaa.views.account import logout @@ 
-186,7 +186,11 @@ def home(rss): else: # Otherwise, use db search for everything query_args['term'] = search_term or '' - query = search_db(**query_args) + if app.config['USE_BAKED_SEARCH']: + query = search_db_baked(**query_args) + else: + query = search_db(**query_args) + if render_as_rss: return render_rss('Home', query, use_elastic=False, magnet_links=use_magnet_links) else: diff --git a/nyaa/views/users.py b/nyaa/views/users.py index 87ca47122..decf3f6f5 100644 --- a/nyaa/views/users.py +++ b/nyaa/views/users.py @@ -12,7 +12,7 @@ from nyaa import forms, models from nyaa.extensions import db from nyaa.search import (DEFAULT_MAX_SEARCH_RESULT, DEFAULT_PER_PAGE, SERACH_PAGINATE_DISPLAY_MSG, - _generate_query_string, search_db, search_elastic) + _generate_query_string, search_db, search_db_baked, search_elastic) from nyaa.utils import admin_only, chain_get, sha1_hash app = flask.current_app @@ -185,7 +185,10 @@ def view_user(user_name): query_args['term'] = '' else: query_args['term'] = search_term or '' - query = search_db(**query_args) + if app.config['USE_BAKED_SEARCH']: + query = search_db_baked(**query_args) + else: + query = search_db(**query_args) return flask.render_template('user.html', use_elastic=False, torrent_query=query, From c655463078e4bd295ffdb2349fbff9f080589934 Mon Sep 17 00:00:00 2001 From: TheAMM Date: Thu, 31 Oct 2019 22:56:47 +0200 Subject: [PATCH 2/2] search: Allow specifying multiple usernames... ...by repeating &u=user in the GET parameters. No UX for this yet. Reworks the RSS URL generator to fit with duplicated keys. 
--- nyaa/search.py | 64 ++++++++++++++------------------------ nyaa/templates/layout.html | 4 +-- nyaa/views/main.py | 24 ++++++++------ nyaa/views/users.py | 4 +-- 4 files changed, 41 insertions(+), 55 deletions(-) diff --git a/nyaa/search.py b/nyaa/search.py index f6932afe1..7ce544581 100644 --- a/nyaa/search.py +++ b/nyaa/search.py @@ -3,6 +3,7 @@ import shlex import threading import time +from urllib.parse import quote, urlencode import flask from flask_sqlalchemy import Pagination @@ -60,17 +61,18 @@ def _get_index_name(column): return table_indexes.get(column.name) -def _generate_query_string(term, category, filter, user): - params = {} +def _generate_query_string(term, category, filter, user_names): + params = [] if term: - params['q'] = str(term) + params.append(('q', str(term))) if category: - params['c'] = str(category) + params.append(('c', str(category))) if filter: - params['f'] = str(filter) - if user: - params['u'] = str(user) - return params + params.append(('f', str(filter))) + for name in user_names: + params.append(('u', name)) + + return urlencode(params, quote_via=quote) # For preprocessing ES search terms in _parse_es_search_terms @@ -181,7 +183,7 @@ def must_matcher(match): return search -def search_elastic(term='', user=None, sort='id', order='desc', +def search_elastic(term='', user_ids=None, sort='id', order='desc', category='0_0', quality_filter='0', page=1, rss=False, admin=False, logged_in_user=None, per_page=75, max_search_results=1000): @@ -261,17 +263,9 @@ def search_elastic(term='', user=None, sort='id', order='desc', if not main_category: flask.abort(400) - # This might be useless since we validate users - # before coming into this method, but just to be safe... 
- if user: - user = models.User.by_id(user) - if not user: - flask.abort(404) - user = user.id - same_user = False if logged_in_user: - same_user = user == logged_in_user.id + same_user = len(user_ids) == 1 and logged_in_user.id in user_ids s = Search(using=es_client, index=app.config.get('ES_INDEX_NAME')) # todo, sukebei prefix @@ -281,8 +275,8 @@ def search_elastic(term='', user=None, sort='id', order='desc', s = _parse_es_search_terms(s, term) # User view (/user/username) - if user: - s = s.filter('term', uploader_id=user) + if user_ids: + s = s.filter('terms', uploader_id=user_ids) if not admin: # Hide all DELETED torrents if regular user @@ -370,7 +364,7 @@ def wrapper(*args, **kwargs): return wrapper -def search_db(term='', user=None, sort='id', order='desc', category='0_0', +def search_db(term='', user_ids=None, sort='id', order='desc', category='0_0', quality_filter='0', page=1, rss=False, admin=False, logged_in_user=None, per_page=75): if page > 4294967295: @@ -380,7 +374,7 @@ def search_db(term='', user=None, sort='id', order='desc', category='0_0', same_user = False if logged_in_user: - same_user = logged_in_user.id == user + same_user = len(user_ids) == 1 and logged_in_user.id in user_ids # Logged in users should always be able to view their full listing. 
if same_user or admin: @@ -426,12 +420,6 @@ def search_db(term='', user=None, sort='id', order='desc', category='0_0', if filter_tuple is sentinel: flask.abort(400) - if user: - user = models.User.by_id(user) - if not user: - flask.abort(404) - user = user.id - main_category = None sub_category = None main_cat_id = 0 @@ -469,8 +457,8 @@ def search_db(term='', user=None, sort='id', order='desc', category='0_0', qpc = QueryPairCaller(query, count_query) # User view (/user/username) - if user: - qpc.filter(models.Torrent.uploader_id == user) + if user_ids: + qpc.filter(models.Torrent.uploader_id.in_(user_ids)) if not admin: # Hide all DELETED torrents if regular user @@ -610,7 +598,7 @@ def __iadd__(self, other): } -def search_db_baked(term='', user=None, sort='id', order='desc', category='0_0', +def search_db_baked(term='', user_ids=None, sort='id', order='desc', category='0_0', quality_filter='0', page=1, rss=False, admin=False, logged_in_user=None, per_page=75): if page > 4294967295: @@ -631,12 +619,6 @@ def search_db_baked(term='', user=None, sort='id', order='desc', category='0_0', if filter_lambda is sentinel: flask.abort(400) - if user: - user = models.User.by_id(user) - if not user: - flask.abort(404) - user = user.id - main_cat_id = 0 sub_cat_id = 0 @@ -664,7 +646,7 @@ def search_db_baked(term='', user=None, sort='id', order='desc', category='0_0', same_user = False if logged_in_user: - same_user = logged_in_user.id == user + same_user = len(user_ids) == 1 and logged_in_user.id in user_ids if term: query = bakery(lambda session: session.query(models.TorrentNameSearch)) @@ -685,9 +667,9 @@ def search_db_baked(term='', user=None, sort='id', order='desc', category='0_0', baked_params = {} # User view (/user/username) - if user: - qpc += lambda q: q.filter(models.Torrent.uploader_id == bp('user')) - baked_params['user'] = user + if user_ids: + qpc += lambda q: q.filter(models.Torrent.uploader_id.in_(bp('user_ids', expanding=True))) + baked_params['user_ids'] = 
user_ids if not admin: # Hide all DELETED torrents if regular user diff --git a/nyaa/templates/layout.html b/nyaa/templates/layout.html index 029b1e321..30179b6a5 100644 --- a/nyaa/templates/layout.html +++ b/nyaa/templates/layout.html @@ -9,7 +9,7 @@ - + @@ -93,7 +93,7 @@
  • Trusted
  • -
  • RSS
  • +
  • RSS
  • {% if config.SITE_FLAVOR == 'nyaa' %}
  • Fap
  • {% elif config.SITE_FLAVOR == 'sukebei' %} diff --git a/nyaa/views/main.py b/nyaa/views/main.py index d6f9a7292..d6c0f47c2 100644 --- a/nyaa/views/main.py +++ b/nyaa/views/main.py @@ -76,7 +76,7 @@ def home(rss): category = chain_get(req_args, 'c', 'cats') quality_filter = chain_get(req_args, 'f', 'filter') - user_name = chain_get(req_args, 'u', 'user') + user_names = set(req_args.getlist('u') + req_args.getlist('user')) page_number = chain_get(req_args, 'p', 'page', 'offset') try: page_number = max(1, int(page_number)) @@ -88,12 +88,16 @@ def home(rss): results_per_page = app.config.get('RESULTS_PER_PAGE', DEFAULT_PER_PAGE) - user_id = None - if user_name: - user = models.User.by_username(user_name) - if not user: + user_ids = [] + if user_names: + for name in user_names: + user = models.User.by_username(name) + if user: + user_ids.append(user.id) + # If we have usernames to look up but find none, 404 + if not user_ids: flask.abort(404) - user_id = user.id + user_ids = tuple(user_ids) special_results = { 'first_word_user': None, @@ -101,7 +105,7 @@ def home(rss): 'infohash_torrent': None } # Add advanced features to searches (but not RSS or user searches) - if search_term and not render_as_rss and not user_id: + if search_term and not render_as_rss and not user_ids: # Check if the first word of the search is an existing user user_word_match = re.match(r'^([a-zA-Z0-9_-]+) *(.*|$)', search_term) if user_word_match: @@ -122,7 +126,7 @@ def home(rss): special_results['infohash_torrent'] = matched_torrent query_args = { - 'user': user_id, + 'user_ids': user_ids, 'sort': sort_key or 'id', 'order': sort_order or 'desc', 'category': category or '0_0', @@ -166,7 +170,7 @@ def home(rss): use_elastic=True, magnet_links=use_magnet_links) else: rss_query_string = _generate_query_string( - search_term, category, quality_filter, user_name) + search_term, category, quality_filter, user_names) max_results = min(max_search_results, query_results['hits']['total']) # change p= 
argument to whatever you change page_parameter to or pagination breaks pagination = Pagination(p=query_args['page'], per_page=results_per_page, @@ -195,7 +199,7 @@ def home(rss): return render_rss('Home', query, use_elastic=False, magnet_links=use_magnet_links) else: rss_query_string = _generate_query_string( - search_term, category, quality_filter, user_name) + search_term, category, quality_filter, user_names) # Use elastic is always false here because we only hit this section # if we're browsing without a search term (which means we default to DB) # or if ES is disabled diff --git a/nyaa/views/users.py b/nyaa/views/users.py index decf3f6f5..1bcd460c3 100644 --- a/nyaa/views/users.py +++ b/nyaa/views/users.py @@ -130,7 +130,7 @@ def view_user(user_name): query_args = { 'term': search_term or '', - 'user': user.id, + 'user_ids': (user.id,), # Tuple! 'sort': sort_key or 'id', 'order': sort_order or 'desc', 'category': category or '0_0', @@ -146,7 +146,7 @@ def view_user(user_name): query_args['admin'] = True # Use elastic search for term searching - rss_query_string = _generate_query_string(search_term, category, quality_filter, user_name) + rss_query_string = _generate_query_string(search_term, category, quality_filter, [user_name]) use_elastic = app.config.get('USE_ELASTIC_SEARCH') if use_elastic and search_term: query_args['term'] = search_term