From 883654b7a30066485124424e62962009bb2ffd1b Mon Sep 17 00:00:00 2001 From: Rafael JPD Date: Wed, 6 May 2026 08:28:08 -0300 Subject: [PATCH 1/3] Captura titulo do documento no acesso COUNTER --- metrics/counter/access.py | 159 ++++++++++++++++++++++++++++++-------- 1 file changed, 127 insertions(+), 32 deletions(-) diff --git a/metrics/counter/access.py b/metrics/counter/access.py index 12c0cc5..65f9b27 100644 --- a/metrics/counter/access.py +++ b/metrics/counter/access.py @@ -16,7 +16,10 @@ standardize_year_of_publication, ) from core.utils.date_utils import extract_minute_second_key, truncate_datetime_to_hour -from metrics.counter.identifiers import generate_item_access_id, generate_user_session_id +from metrics.counter.identifiers import ( + generate_item_access_id, + generate_user_session_id, +) def extract_item_access_data(collection_acron3: str, translated_url: dict): @@ -26,7 +29,9 @@ def extract_item_access_data(collection_acron3: str, translated_url: dict): source_type = _extract_source_type(collection_acron3, translated_url) source_id = _extract_source_id(collection_acron3, translated_url, source_type) scielo_issn = _extract_scielo_issn(translated_url, source_type, source_id) - document_type = _extract_document_type(collection_acron3, translated_url, source_type) + document_type = _extract_document_type( + collection_acron3, translated_url, source_type + ) publication_year = _safe_standardize( standardize_year_of_publication, translated_url.get("year_of_publication"), @@ -39,6 +44,7 @@ def extract_item_access_data(collection_acron3: str, translated_url: dict): "source_id": source_id, "scielo_issn": scielo_issn, "document_type": document_type, + "document_title": _extract_document_title(translated_url, document_type), "pid_v2": _safe_standardize(standardize_pid_v2, translated_url.get("pid_v2")), "pid_v3": _safe_standardize(standardize_pid_v3, translated_url.get("pid_v3")), "pid_generic": _safe_standardize( @@ -59,7 +65,8 @@ def extract_item_access_data(collection_acron3: str, translated_url: dict): ), "media_format": translated_url.get("media_format"), "content_type": translated_url.get("content_type"), - "access_url": translated_url.get("access_url") or translated_url.get("normalized_url"), + "access_url": translated_url.get("access_url") + or translated_url.get("normalized_url"), "publication_year": publication_year, "counter_access_type": _counter_access_type(source_access_type), "access_method": "Regular", @@ -73,7 +80,9 @@ def extract_item_access_data(collection_acron3: str, translated_url: dict): "source_publisher_name": translated_url.get("source_publisher_name") or translated_url.get("journal_publisher_name"), "source_access_type": source_access_type, - "source_identifiers": _extract_source_identifiers(translated_url, source_id, source_type), + "source_identifiers": _extract_source_identifiers( + translated_url, source_id, source_type + ), "source_city": translated_url.get("source_city"), "source_country": translated_url.get("source_country"), } @@ -81,7 +90,10 @@ def extract_item_access_data(collection_acron3: str, translated_url: dict): def is_valid_item_access_data(data: dict, utm=None, ignore_utm_validation=False): if not isinstance(data, dict): - return False, {"message": "Invalid data format. Expected a dictionary.", "code": "invalid_format"} + return False, { + "message": "Invalid data format. Expected a dictionary.", + "code": "invalid_format", + } scielo_issn = data.get("scielo_issn") source_id = data.get("source_id") @@ -96,23 +108,46 @@ def is_valid_item_access_data(data: dict, utm=None, ignore_utm_validation=False) has_source_identity = bool(source_id) or bool( scielo_issn and scielo_issn != DEFAULT_SCIELO_ISSN ) - has_media_language = bool(media_language and media_language != MEDIA_LANGUAGE_UNDEFINED) + has_media_language = bool( + media_language and media_language != MEDIA_LANGUAGE_UNDEFINED + ) has_pid = bool(pid_v2 or pid_v3 or pid_generic) - if not all([media_format and media_format != MEDIA_FORMAT_UNDEFINED, content_type and content_type != CONTENT_TYPE_UNDEFINED, has_pid]): - return False, {"message": "Missing required fields in item access data.", "code": "missing_fields"} + if not all( + [ + media_format and media_format != MEDIA_FORMAT_UNDEFINED, + content_type and content_type != CONTENT_TYPE_UNDEFINED, + has_pid, + ] + ): + return False, { + "message": "Missing required fields in item access data.", + "code": "missing_fields", + } if document_type in {"article", "book", "chapter"} and not has_media_language: - return False, {"message": "Missing media language in item access data.", "code": "missing_fields"} + return False, { + "message": "Missing media language in item access data.", + "code": "missing_fields", + } if document_type == "article" and not has_source_identity: - return False, {"message": "Missing article source identity.", "code": "missing_fields"} + return False, { + "message": "Missing article source identity.", + "code": "missing_fields", + } if document_type in {"book", "chapter"} and not source_id: - return False, {"message": "Missing book source identity.", "code": "missing_fields"} + return False, { + "message": "Missing book source identity.", + "code": "missing_fields", + } if document_type in {"preprint", "dataset"} and not pid_generic: - return False, {"message": "Missing generic PID in item access data.", "code": "missing_fields"} + return False, { + "message": "Missing generic PID in item access data.", + "code": "missing_fields", + } if utm and not ignore_utm_validation: if ( @@ -121,7 +156,10 @@ def is_valid_item_access_data(data: dict, utm=None, ignore_utm_validation=False) and scielo_issn != DEFAULT_SCIELO_ISSN and not utm.is_valid_code(scielo_issn, utm.sources_metadata["issn_set"]) ): - return False, {"message": f"Invalid scielo_issn: {scielo_issn}", "code": "invalid_scielo_issn"} + return False, { + "message": f"Invalid scielo_issn: {scielo_issn}", + "code": "invalid_scielo_issn", + } if ( source_type @@ -129,21 +167,37 @@ def is_valid_item_access_data(data: dict, utm=None, ignore_utm_validation=False) and source_id and source_id not in utm.sources_metadata.get("source_id_to_type", {}) ): - return False, {"message": f"Invalid source_id: {source_id}", "code": "invalid_source_id"} + return False, { + "message": f"Invalid source_id: {source_id}", + "code": "invalid_source_id", + } if pid_v2 and not utm.is_valid_code(pid_v2, utm.documents_metadata["pid_set"]): - return False, {"message": f"Invalid pid_v2: {pid_v2}", "code": "invalid_pid_v2"} + return False, { + "message": f"Invalid pid_v2: {pid_v2}", + "code": "invalid_pid_v2", + } if pid_v3 and not utm.is_valid_code(pid_v3, utm.documents_metadata["pid_set"]): - return False, {"message": f"Invalid pid_v3: {pid_v3}", "code": "invalid_pid_v3"} + return False, { + "message": f"Invalid pid_v3: {pid_v3}", + "code": "invalid_pid_v3", + } - if pid_generic and not utm.is_valid_code(pid_generic, utm.documents_metadata["pid_set"]): - return False, {"message": f"Invalid pid_generic: {pid_generic}", "code": "invalid_pid_generic"} + if pid_generic and not utm.is_valid_code( + pid_generic, utm.documents_metadata["pid_set"] + ): + return False, { + "message": f"Invalid pid_generic: {pid_generic}", + "code": "invalid_pid_generic", + } return True, {"message": "Item access data is valid.", "code": "valid"} -def update_results_with_item_access_data(results: dict, item_access_data: dict, line: dict): +def update_results_with_item_access_data( + results: dict, item_access_data: dict, line: dict +): col_acron3 = item_access_data.get("collection") source_key = ( item_access_data.get("source_id") @@ -156,7 +210,9 @@ def update_results_with_item_access_data(results: dict, item_access_data: dict, media_format = item_access_data.get("media_format") content_language = item_access_data.get("media_language") content_type = item_access_data.get("content_type") - access_url = item_access_data.get("access_url") or _normalize_access_url(line.get("url")) + access_url = item_access_data.get("access_url") or _normalize_access_url( + line.get("url") + ) client_name = line.get("client_name") client_version = line.get("client_version") @@ -202,6 +258,7 @@ def update_results_with_item_access_data(results: dict, item_access_data: dict, "pid_v2": pid_v2, "pid_v3": pid_v3, "pid_generic": access_target.get("pid_generic"), + "document": _build_document(item_access_data), "title_pid_generic": ( item_access_data.get("title_pid_generic") or access_target.get("pid_generic") @@ -218,7 +275,8 @@ def update_results_with_item_access_data(results: dict, item_access_data: dict, "access_year": access_year, "access_month": access_month, "publication_year": item_access_data.get("publication_year"), - "counter_access_type": item_access_data.get("counter_access_type") or "Open", + "counter_access_type": item_access_data.get("counter_access_type") + or "Open", "access_method": item_access_data.get("access_method") or "Regular", "source": { "source_type": item_access_data.get("source_type"), @@ -229,7 +287,9 @@ def update_results_with_item_access_data(results: dict, item_access_data: dict, "access_type": item_access_data.get("source_access_type"), "city": item_access_data.get("source_city"), "country": item_access_data.get("source_country"), - "subject_area_capes": item_access_data.get("source_subject_area_capes"), + "subject_area_capes": item_access_data.get( + "source_subject_area_capes" + ), "subject_area_wos": item_access_data.get("source_subject_area_wos"), "acronym": item_access_data.get("source_acronym"), "publisher_name": item_access_data.get("source_publisher_name"), @@ -246,7 +306,9 @@ def update_results_with_item_access_data(results: dict, item_access_data: dict, media_format, content_type, ) - timestamps_by_url = results[item_access_id].setdefault("click_timestamps_by_url", {}) + timestamps_by_url = results[item_access_id].setdefault( + "click_timestamps_by_url", {} + ) url_timestamps = timestamps_by_url.setdefault(access_url_key, {}) if ms_key not in url_timestamps: url_timestamps[ms_key] = 0 @@ -276,7 +338,9 @@ def _extract_source_type(collection_acron3, translated_url): ): return "journal" - if translated_url.get("journal_acronym") or translated_url.get("journal_main_title"): + if translated_url.get("journal_acronym") or translated_url.get( + "journal_main_title" + ): return "journal" return "other" @@ -328,6 +392,18 @@ def _extract_source_title(translated_url): ) +def _extract_document_title(translated_url, document_type): + if document_type == "chapter": + return translated_url.get("chapter_title") + if document_type == "book": + return translated_url.get("book_title") + return ( + translated_url.get("document_title") + or translated_url.get("article_title") + or translated_url.get("title") + ) + + def _extract_document_type(collection_acron3, translated_url, source_type): document_type = translated_url.get("document_type") if document_type: @@ -356,7 +432,11 @@ def _extract_document_type(collection_acron3, translated_url, source_type): def _extract_source_identifiers(translated_url, source_id, source_type): identifiers = translated_url.get("source_identifiers") if isinstance(identifiers, dict): - compact = {key: value for key, value in identifiers.items() if value not in (None, "", [], {}, ())} + compact = { + key: value + for key, value in identifiers.items() + if value not in (None, "", [], {}, ()) + } if compact: return compact @@ -369,7 +449,11 @@ def _extract_source_identifiers(translated_url, source_id, source_type): "eisbn": translated_url.get("eisbn"), "doi": translated_url.get("doi"), } - compact = {key: value for key, value in compact.items() if value not in (None, "", [], {}, ())} + compact = { + key: value + for key, value in compact.items() + if value not in (None, "", [], {}, ()) + } return compact or None @@ -409,6 +493,13 @@ def _standardize_pid_generic_list(values): return items +def _build_document(item_access_data): + title = item_access_data.get("document_title") + if not title: + return {} + return {"title": title} + + def _iter_access_targets(item_access_data): return [ { @@ -422,7 +513,9 @@ def _normalize_access_url(url): if not url: return None parsed_url = urlparse(str(url).strip()) - path = parsed_url.path if parsed_url.scheme or parsed_url.netloc else str(url).strip() + path = ( + parsed_url.path if parsed_url.scheme or parsed_url.netloc else str(url).strip() + ) path = unquote(path or "") path = path.split("?", 1)[0].split("#", 1)[0].split()[0] path = re.sub(r"/+", "/", path) @@ -431,8 +524,10 @@ def _normalize_access_url(url): def _fallback_access_url_key(pid_generic, media_format, content_type): - return "|".join([ - str(pid_generic or ""), - str(media_format or ""), - str(content_type or ""), - ]) + return "|".join( + [ + str(pid_generic or ""), + str(media_format or ""), + str(content_type or ""), + ] + ) From 5f1fd28d5e5e0b2463538f2db83c8687df4946a9 Mon Sep 17 00:00:00 2001 From: Rafael JPD Date: Wed, 6 May 2026 08:28:29 -0300 Subject: [PATCH 2/3] Reorganiza documentos e mappings de metricas --- metrics/counter/documents.py | 280 ++++++++++++++++++++---------- metrics/opensearch/mappings.py | 206 ++++++++-------------- metrics/services/export.py | 9 +- metrics/tests/test_index_utils.py | 194 ++++++++++++++++----- metrics/tests/test_opensearch.py | 48 ++++- 5 files changed, 458 insertions(+), 279 deletions(-) diff --git a/metrics/counter/documents.py b/metrics/counter/documents.py index 63730ae..e13c0cf 100644 --- a/metrics/counter/documents.py +++ b/metrics/counter/documents.py @@ -9,7 +9,10 @@ parent_data_type, should_create_book_item_document, ) -from metrics.counter.identifiers import generate_month_document_id, generate_year_document_id +from metrics.counter.identifiers import ( + generate_month_document_id, + generate_year_document_id, +) def convert_to_month_index_documents(data: dict): @@ -168,7 +171,9 @@ def _generate_document_id(value, granularity, metric_scope=None, pid_generic=Non pid_generic = pid_generic or value.get("pid_generic") publication_year = str(value.get("publication_year") or "0001") if granularity == "month": - access_month = value.get("access_date", "")[:7] if value.get("access_date") else "" + access_month = ( + value.get("access_date", "")[:7] if value.get("access_date") else "" + ) return generate_month_document_id( collection=value.get("collection"), source_key=value.get("source_key"), @@ -198,125 +203,224 @@ def _generate_document_id(value, granularity, metric_scope=None, pid_generic=Non ) -def _build_base_document(value, granularity, metric_scope=None, pid_generic=None, document_type=None): +def _build_base_document( + value, granularity, metric_scope=None, pid_generic=None, document_type=None +): collection = value.get("collection") + scope = metric_scope or "item" if collection == "books": - normalized_pid_generic = pid_generic or value.get("pid_generic") - title_pid_generic = extract_title_pid_generic(value, fallback=normalized_pid_generic) + document_id = pid_generic or value.get("pid_generic") + parent_id = extract_title_pid_generic(value, fallback=document_id) + if parent_id == document_id or scope == "title": + parent_id = None + raw_source = value.get("source") or {} + source = _build_source(raw_source) base_document = { "collection": collection, - "source": _build_books_source(value.get("source")), - "document_type": document_type or value.get("document_type"), - "scielo_document_type": document_type or value.get("document_type"), - "metric_scope": metric_scope or "item", - "counter_data_type": "Book" if metric_scope == "title" else "Book_Segment", - "parent_data_type": "Book" if metric_scope != "title" else None, - "title_pid_generic": title_pid_generic, - "pid": normalized_pid_generic, - "pid_generic": normalized_pid_generic, - "publication_year": value.get("publication_year"), - "counter_access_type": value.get("counter_access_type") or "Open", - "access_method": value.get("access_method") or "Regular", + "source": source, + "document": _build_document( + value=value, + document_id=document_id, + document_type=document_type or value.get("document_type"), + parent_id=parent_id, + source_identifiers=raw_source.get("identifiers"), + metric_scope=scope, + ), + "counter": _compact_dict( + { + "metric_scope": scope, + "data_type": "Book" if scope == "title" else "Book_Segment", + "parent_data_type": "Book" if scope != "title" else None, + "access_type": value.get("counter_access_type") or "Open", + "access_method": value.get("access_method") or "Regular", + } + ), "total_requests": 0, "total_investigations": 0, "unique_requests": 0, "unique_investigations": 0, } - _apply_access_fields(base_document, value, granularity) - if granularity == "year": - base_document["content_language"] = value.get("content_language") - base_document["access_country_code"] = value.get("access_country_code") + base_document["access"] = _build_access(value, granularity) + if granularity == "month": + base_document["daily_metrics"] = _build_daily_metrics(value) return base_document + document_type = value.get("document_type") + document_id = value.get("pid_v3") or value.get("pid_v2") or value.get("pid_generic") base_document = { "collection": collection, - "source": _build_standard_source(value.get("source")), - "document_type": value.get("document_type"), - "scielo_document_type": value.get("document_type"), - "metric_scope": "item", - "counter_data_type": counter_data_type(value.get("document_type")), - "parent_data_type": parent_data_type( - value.get("document_type"), - (value.get("source") or {}).get("source_type"), + "source": _build_source(value.get("source")), + "document": _build_document( + value=value, + document_id=document_id, + document_type=document_type, + ), + "counter": _compact_dict( + { + "metric_scope": "item", + "data_type": counter_data_type(document_type), + "parent_data_type": parent_data_type( + document_type, + (value.get("source") or {}).get("source_type"), + ), + "article_version": article_version(document_type), + "access_type": value.get("counter_access_type") or "Open", + "access_method": value.get("access_method") or "Regular", + } ), - "article_version": article_version(value.get("document_type")), - "pid": value.get("pid_v3") or value.get("pid_v2") or value.get("pid_generic"), - "pid_v2": value.get("pid_v2"), - "pid_v3": value.get("pid_v3"), - "pid_generic": value.get("pid_generic"), - "publication_year": value.get("publication_year"), - "counter_access_type": value.get("counter_access_type") or "Open", - "access_method": value.get("access_method") or "Regular", "total_requests": 0, "total_investigations": 0, "unique_requests": 0, "unique_investigations": 0, } - _apply_access_fields(base_document, value, granularity) - if granularity == "year": - base_document["content_language"] = value.get("content_language") - base_document["access_country_code"] = value.get("access_country_code") + base_document["access"] = _build_access(value, granularity) + if granularity == "month": + base_document["daily_metrics"] = _build_daily_metrics(value) return base_document -def _apply_access_fields(base_document, value, granularity): +def _build_access(value, granularity): if granularity == "month": - base_document["access_month"] = value.get("access_date", "")[:7] if value.get("access_date") else "" - day = value.get("access_date", "")[-2:] if value.get("access_date") else "01" - base_document["daily_metrics"] = { - day: { - "total_requests": 0, - "total_investigations": 0, - "unique_requests": 0, - "unique_investigations": 0, - } + return { + "month": value.get("access_date", "")[:7] + if value.get("access_date") + else "" } - return - base_document["access_year"] = value.get("access_year") + return _compact_dict( + { + "year": value.get("access_year"), + "country_code": value.get("access_country_code"), + "content_language": value.get("content_language"), + } + ) -def _build_books_source(source): - source = source or {} - identifiers = source.get("identifiers") or {} - compact_identifiers = { - key: value - for key, value in identifiers.items() - if key in {"book_id", "isbn", "eisbn", "doi"} and value not in (None, "", [], {}, ()) +def _build_daily_metrics(value): + day = value.get("access_date", "")[-2:] if value.get("access_date") else "01" + return { + day: { + "total_requests": 0, + "total_investigations": 0, + "unique_requests": 0, + "unique_investigations": 0, + } } - return { - "source_type": source.get("source_type"), - "source_id": source.get("source_id"), - "main_title": source.get("main_title"), - "access_type": source.get("access_type"), - "publisher": source.get("publisher_name"), - "city": source.get("city"), - "country": source.get("country"), - "identifiers": compact_identifiers, + +def _build_document( + value, + document_id, + document_type, + parent_id=None, + source_identifiers=None, + metric_scope="item", +): + document = value.get("document") or {} + title = document.get("title") + if metric_scope == "title": + title = (value.get("source") or {}).get("main_title") or title + + identifiers = _document_identifiers( + value=value, + document_id=document_id, + source_identifiers=source_identifiers, + metric_scope=metric_scope, + ) + + return _compact_dict( + { + "id": document_id, + "type": document_type, + "title": title, + "parent_id": parent_id, + "publication_year": value.get("publication_year"), + "identifiers": identifiers, + } + ) + + +def _document_identifiers( + value, document_id, source_identifiers=None, metric_scope="item" +): + if value.get("collection") == "books" and metric_scope == "title": + identifiers = _book_identifiers_from_pid(document_id) + identifiers.update(source_identifiers or {}) + return _compact_identifiers(identifiers, canonical_id=document_id) + + document_identifiers = (value.get("document") or {}).get("identifiers") or {} + identifiers = { + "pid_v2": value.get("pid_v2"), + "pid_v3": value.get("pid_v3"), + "pid_generic": value.get("pid_generic"), } + identifiers.update(document_identifiers) + if value.get("collection") == "books": + identifiers.update(_book_identifiers_from_pid(value.get("pid_generic"))) + identifiers.update(source_identifiers or {}) -def _build_standard_source(source): + return _compact_identifiers(identifiers, canonical_id=document_id) + + +def _book_identifiers_from_pid(pid_generic): + value = str(pid_generic or "") + if not value.upper().startswith("BOOK:"): + return {} + + identifiers = {} + parts = value.split("/", 1) + book_id = parts[0].split(":", 1)[1] if ":" in parts[0] else "" + if book_id: + identifiers["book_id"] = book_id + + if len(parts) > 1 and parts[1].upper().startswith("CHAPTER:"): + chapter_id = parts[1].split(":", 1)[1] if ":" in parts[1] else "" + if chapter_id: + identifiers["chapter_id"] = chapter_id + + return identifiers + + +def _build_source(source): source = source or {} - identifiers = source.get("identifiers") or {} - compact_identifiers = { - key: value - for key, value in identifiers.items() - if value not in (None, "", [], {}, ()) - } + source_id = source.get("source_id") + source_type = source.get("source_type") + identifiers = _compact_identifiers( + source.get("identifiers") or {}, canonical_id=source_id + ) + + return _compact_dict( + { + "id": source_id, + "type": source_type, + "title": source.get("main_title"), + "scielo_issn": None if source_type == "book" else source.get("scielo_issn"), + "acronym": source.get("acronym"), + "publisher_name": source.get("publisher_name"), + "subject_area_capes": source.get("subject_area_capes"), + "subject_area_wos": source.get("subject_area_wos"), + "access_type": source.get("access_type"), + "city": source.get("city"), + "country": source.get("country"), + "identifiers": identifiers, + } + ) + + +def _compact_identifiers(identifiers, canonical_id=None): + compact = {} + canonical_value = str(canonical_id or "").strip().upper() + for key, value in (identifiers or {}).items(): + if value in (None, "", [], {}, ()): + continue + if canonical_value and str(value).strip().upper() == canonical_value: + continue + compact[key] = value + return compact + +def _compact_dict(data): return { - "source_type": source.get("source_type"), - "source_id": source.get("source_id"), - "scielo_issn": source.get("scielo_issn"), - "main_title": source.get("main_title"), - "acronym": source.get("acronym"), - "publisher_name": source.get("publisher_name"), - "subject_area_capes": source.get("subject_area_capes"), - "subject_area_wos": source.get("subject_area_wos"), - "access_type": source.get("access_type"), - "city": source.get("city"), - "country": source.get("country"), - "identifiers": compact_identifiers, + key: value for key, value in data.items() if value not in (None, "", [], {}, ()) } diff --git a/metrics/opensearch/mappings.py b/metrics/opensearch/mappings.py index 5825c1b..def652f 100644 --- a/metrics/opensearch/mappings.py +++ b/metrics/opensearch/mappings.py @@ -1,162 +1,92 @@ -YEAR_INDEX_MAPPINGS = { +TEXT_KEYWORD_MAPPING = { + "type": "text", + "fields": {"keyword": {"type": "keyword", "ignore_above": 512}}, +} + +IDENTIFIERS_MAPPING = {"type": "object", "dynamic": True} + +DOCUMENT_MAPPINGS = { "properties": { - "collection": {"type": "keyword"}, - "source": { - "properties": { - "source_type": {"type": "keyword"}, - "source_id": {"type": "keyword"}, - "scielo_issn": {"type": "keyword"}, - "main_title": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 512 - } - } - }, - "subject_area_capes": {"type": "keyword"}, - "subject_area_wos": {"type": "keyword"}, - "acronym": {"type": "keyword"}, - "publisher_name": {"type": "keyword"}, - "access_type": {"type": "keyword"}, - "city": {"type": "keyword"}, - "country": {"type": "keyword"}, - "identifiers": {"type": "object"}, - } - }, - "document_type": {"type": "keyword"}, - "scielo_document_type": {"type": "keyword"}, - "metric_scope": {"type": "keyword"}, - "counter_data_type": {"type": "keyword"}, - "parent_data_type": {"type": "keyword"}, - "article_version": {"type": "keyword"}, - "pid": {"type": "keyword"}, - "pid_v2": {"type": "keyword"}, - "pid_v3": {"type": "keyword"}, - "pid_generic": {"type": "keyword"}, + "id": {"type": "keyword"}, + "type": {"type": "keyword"}, + "title": TEXT_KEYWORD_MAPPING, + "parent_id": {"type": "keyword"}, "publication_year": {"type": "integer"}, - "counter_access_type": {"type": "keyword"}, - "access_method": {"type": "keyword"}, - "access_year": {"type": "date", "format": "yyyy"}, - "access_country_code": {"type": "keyword"}, - "content_language": {"type": "keyword"}, - "applied_jobs": {"type": "keyword", "index": False}, - "total_requests": {"type": "integer"}, - "total_investigations": {"type": "integer"}, - "unique_requests": {"type": "integer"}, - "unique_investigations": {"type": "integer"}, + "identifiers": IDENTIFIERS_MAPPING, } } +SOURCE_MAPPINGS = { + "properties": { + "id": {"type": "keyword"}, + "type": {"type": "keyword"}, + "title": TEXT_KEYWORD_MAPPING, + "scielo_issn": {"type": "keyword"}, + "acronym": {"type": "keyword"}, + "publisher_name": {"type": "keyword"}, + "access_type": {"type": "keyword"}, + "city": {"type": "keyword"}, + "country": {"type": "keyword"}, + "subject_area_capes": {"type": "keyword"}, + "subject_area_wos": {"type": "keyword"}, + "identifiers": IDENTIFIERS_MAPPING, + } +} -MONTH_INDEX_MAPPINGS = { +COUNTER_MAPPINGS = { "properties": { - "collection": {"type": "keyword"}, - "source": YEAR_INDEX_MAPPINGS["properties"]["source"], - "document_type": {"type": "keyword"}, - "scielo_document_type": {"type": "keyword"}, "metric_scope": {"type": "keyword"}, - "counter_data_type": {"type": "keyword"}, + "data_type": {"type": "keyword"}, "parent_data_type": {"type": "keyword"}, "article_version": {"type": "keyword"}, - "pid": {"type": "keyword"}, - "pid_v2": {"type": "keyword"}, - "pid_v3": {"type": "keyword"}, - "pid_generic": {"type": "keyword"}, - "publication_year": {"type": "integer"}, - "counter_access_type": {"type": "keyword"}, + "access_type": {"type": "keyword"}, "access_method": {"type": "keyword"}, - "access_month": {"type": "date", "format": "yyyy-MM"}, - "applied_jobs": {"type": "keyword", "index": False}, - "daily_metrics": {"type": "object", "dynamic": True}, - "total_requests": {"type": "integer"}, - "total_investigations": {"type": "integer"}, - "unique_requests": {"type": "integer"}, - "unique_investigations": {"type": "integer"}, } } +MONTH_ACCESS_MAPPINGS = { + "properties": { + "month": {"type": "date", "format": "yyyy-MM"}, + } +} -BOOKS_YEAR_INDEX_MAPPINGS = { +YEAR_ACCESS_MAPPINGS = { "properties": { - "collection": {"type": "keyword"}, - "source": { - "properties": { - "source_type": {"type": "keyword"}, - "source_id": {"type": "keyword"}, - "main_title": { - "type": "text", - "fields": { - "keyword": { - "type": "keyword", - "ignore_above": 512 - } - } - }, - "access_type": {"type": "keyword"}, - "publisher": {"type": "keyword"}, - "city": {"type": "keyword"}, - "country": {"type": "keyword"}, - "identifiers": { - "properties": { - "book_id": {"type": "keyword"}, - "isbn": {"type": "keyword"}, - "eisbn": {"type": "keyword"}, - "doi": {"type": "keyword"}, - } - }, - } - }, - "document_type": {"type": "keyword"}, - "scielo_document_type": {"type": "keyword"}, - "metric_scope": {"type": "keyword"}, - "counter_data_type": {"type": "keyword"}, - "parent_data_type": {"type": "keyword"}, - "article_version": {"type": "keyword"}, - "pid": {"type": "keyword"}, - "pid_generic": {"type": "keyword"}, - "title_pid_generic": {"type": "keyword"}, - "publication_year": {"type": "integer"}, - "counter_access_type": {"type": "keyword"}, - "access_method": {"type": "keyword"}, - "access_year": {"type": "date", "format": "yyyy"}, - "access_country_code": {"type": "keyword"}, + "year": {"type": "date", "format": "yyyy"}, + "country_code": {"type": "keyword"}, "content_language": {"type": "keyword"}, - "applied_jobs": {"type": "keyword", "index": False}, - "total_requests": {"type": "integer"}, - "total_investigations": {"type": "integer"}, - "unique_requests": {"type": "integer"}, - "unique_investigations": {"type": "integer"}, } } +METRIC_PROPERTIES = { + "total_requests": {"type": "integer"}, + "total_investigations": {"type": "integer"}, + "unique_requests": {"type": "integer"}, + "unique_investigations": {"type": "integer"}, +} -BOOKS_MONTH_INDEX_MAPPINGS = { - "properties": { + +def _build_index_mappings(granularity): + properties = { "collection": {"type": "keyword"}, - "source": BOOKS_YEAR_INDEX_MAPPINGS["properties"]["source"], - "document_type": {"type": "keyword"}, - "scielo_document_type": {"type": "keyword"}, - "metric_scope": {"type": "keyword"}, - "counter_data_type": {"type": "keyword"}, - "parent_data_type": {"type": "keyword"}, - "article_version": {"type": "keyword"}, - "pid": {"type": "keyword"}, - "pid_generic": {"type": "keyword"}, - "title_pid_generic": {"type": "keyword"}, - "publication_year": {"type": "integer"}, - "counter_access_type": {"type": "keyword"}, - "access_method": {"type": "keyword"}, - "access_month": {"type": "date", "format": "yyyy-MM"}, + "source": SOURCE_MAPPINGS, + "document": DOCUMENT_MAPPINGS, + "access": MONTH_ACCESS_MAPPINGS + if granularity == "month" + else YEAR_ACCESS_MAPPINGS, + "counter": COUNTER_MAPPINGS, "applied_jobs": {"type": "keyword", "index": False}, - "daily_metrics": {"type": "object", "dynamic": True}, - "total_requests": {"type": "integer"}, - "total_investigations": {"type": "integer"}, - "unique_requests": {"type": "integer"}, - "unique_investigations": {"type": "integer"}, + **METRIC_PROPERTIES, } -} + if granularity == "month": + properties["daily_metrics"] = {"type": "object", "dynamic": True} + return {"properties": properties} + + +YEAR_INDEX_MAPPINGS = _build_index_mappings("year") +MONTH_INDEX_MAPPINGS = _build_index_mappings("month") +BOOKS_YEAR_INDEX_MAPPINGS = _build_index_mappings("year") +BOOKS_MONTH_INDEX_MAPPINGS = _build_index_mappings("month") METRIC_FIELDS = ( @@ -172,6 +102,10 @@ def get_index_mappings(collection, granularity): raise ValueError("Granularity must be 'month' or 'year'.") if collection == "books": - return BOOKS_MONTH_INDEX_MAPPINGS if granularity == "month" else BOOKS_YEAR_INDEX_MAPPINGS + return ( + BOOKS_MONTH_INDEX_MAPPINGS + if granularity == "month" + else BOOKS_YEAR_INDEX_MAPPINGS + ) return MONTH_INDEX_MAPPINGS if granularity == "month" else YEAR_INDEX_MAPPINGS diff --git a/metrics/services/export.py b/metrics/services/export.py index 03efbc6..ef5d9f6 100644 --- a/metrics/services/export.py +++ b/metrics/services/export.py @@ -64,22 +64,25 @@ def _sync_documents_group( index_prefix = settings.OPENSEARCH_INDEX_NAME for doc_id, document in documents.items(): + access = document.get("access") or {} if granularity == "month": index_name = generate_month_index_name( index_prefix=index_prefix, collection=collection, - date=f"{document.get('access_month')}-01", + date=f"{access.get('month')}-01", ) mappings = opensearch.get_index_mappings(collection, "month") else: index_name = generate_year_index_name( index_prefix=index_prefix, collection=collection, - date=f"{document.get('access_year')}-01-01", + date=f"{access.get('year')}-01-01", ) mappings = opensearch.get_index_mappings(collection, "year") - grouped_documents.setdefault(index_name, {"mappings": mappings, "documents": {}}) + grouped_documents.setdefault( + index_name, {"mappings": mappings, "documents": {}} + ) grouped_documents[index_name]["documents"][doc_id] = document for index_name, payload in grouped_documents.items(): diff --git a/metrics/tests/test_index_utils.py b/metrics/tests/test_index_utils.py index 562fc42..e0f0aef 100644 --- a/metrics/tests/test_index_utils.py +++ b/metrics/tests/test_index_utils.py @@ -14,7 +14,8 @@ MEDIA_FORMAT_UNDEFINED, ) -from metrics.counter import access, documents as index_docs +from metrics.counter import access +from metrics.counter import documents as index_docs from metrics.opensearch.names import generate_month_index_name, generate_year_index_name @@ -116,7 +117,9 @@ def test_is_valid_item_access_data_content_type_abstract(self): result, _ = access.is_valid_item_access_data(data) self.assertTrue(result) - def test_is_valid_item_access_data_dataset_without_source_or_language_is_valid(self): + def test_is_valid_item_access_data_dataset_without_source_or_language_is_valid( + self, + ): data = { "document_type": "dataset", "scielo_issn": DEFAULT_SCIELO_ISSN, @@ -219,6 +222,46 @@ def test_extract_item_access_data_tolerates_malformed_media_language(self): self.assertEqual(data["media_language"], "un") + def test_extract_item_access_data_sets_document_title_by_type(self): + chapter = access.extract_item_access_data( + "books", + { + "book_id": "q7gtd", + "chapter_id": "03", + "pid_generic": "book:q7gtd/chapter:03", + "book_title": "Book Title", + "chapter_title": "Chapter Title", + "media_format": MEDIA_FORMAT_HTML, + "media_language": "en", + "content_type": CONTENT_TYPE_FULL_TEXT, + }, + ) + book = access.extract_item_access_data( + "books", + { + "book_id": "q7gtd", + "pid_generic": "book:q7gtd", + "book_title": "Book Title", + "media_format": MEDIA_FORMAT_HTML, + "media_language": "en", + "content_type": CONTENT_TYPE_FULL_TEXT, + }, + ) + article = access.extract_item_access_data( + "scl", + { + "scielo_issn": "1234-5678", + "pid_v3": "jGJccQ7bFdbz6wy3nfXGVdv", + "article_title": "Article Title", + "media_format": MEDIA_FORMAT_HTML, + "content_type": CONTENT_TYPE_FULL_TEXT, + }, + ) + + self.assertEqual(chapter["document_title"], "Chapter Title") + self.assertEqual(book["document_title"], "Book Title") + self.assertEqual(article["document_title"], "Article Title") + def test_extract_item_access_data_normalizes_scielo_collection_document_types(self): preprint = access.extract_item_access_data( "preprints", @@ -268,6 +311,7 @@ def test_update_results_with_item_access_data_stores_source_and_periods(self): "media_format": MEDIA_FORMAT_HTML, "content_type": CONTENT_TYPE_FULL_TEXT, "publication_year": "2023", + "document_title": "Book Title", "source_main_title": "Book Title", "source_subject_area_capes": [], "source_subject_area_wos": [], @@ -295,6 +339,7 @@ def test_update_results_with_item_access_data_stores_source_and_periods(self): self.assertEqual(result["access_country_code"], "BR") self.assertEqual(result["content_language"], "en") self.assertEqual(result["title_pid_generic"], "BOOK:Q7GTD") + self.assertEqual(result["document"], {"title": "Book Title"}) self.assertIn("user_session_id", result) def test_update_results_with_item_access_data_rejects_invalid_local_datetime(self): @@ -322,7 +367,9 @@ def test_update_results_with_item_access_data_rejects_invalid_local_datetime(sel self.assertEqual(results, {}) - def test_update_results_with_item_access_data_does_not_expand_book_into_segments(self): + def test_update_results_with_item_access_data_does_not_expand_book_into_segments( + self, + ): results = {} item_access_data = { "collection": "books", @@ -408,7 +455,9 @@ def test_double_click_filter_uses_url_bucket_for_same_item(self): ) metrics_data = index_docs.convert_raw_results_to_index_documents(results) - month_item = metrics_data["month"]["books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018"] + month_item = metrics_data["month"][ + "books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018" + ] self.assertEqual(month_item["total_requests"], 2) self.assertEqual(month_item["unique_requests"], 1) @@ -456,7 +505,9 @@ def test_double_click_filter_collapses_same_url_within_30_seconds(self): ) metrics_data = index_docs.convert_raw_results_to_index_documents(results) - month_item = metrics_data["month"]["books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018"] + month_item = metrics_data["month"][ + "books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018" + ] self.assertEqual(month_item["total_requests"], 1) self.assertEqual(month_item["unique_requests"], 1) @@ -488,6 +539,7 @@ def test_convert_raw_results_to_index_documents_creates_month_and_year_views(sel "pid_v2": None, "pid_v3": None, "pid_generic": "BOOK:Q7GTD/CHAPTER:03", + "document": {"title": "Chapter Title"}, "title_pid_generic": "BOOK:Q7GTD", "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", "click_timestamps": {"00:05": 1}, @@ -523,25 +575,40 @@ def test_convert_raw_results_to_index_documents_creates_month_and_year_views(sel self.assertEqual(len(metrics_data["month"]), 2) self.assertEqual(len(metrics_data["year"]), 2) - month_item = metrics_data["month"]["books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|2024-01|Open|Regular|2023"] - self.assertEqual(month_item["access_month"], "2024-01") + month_item = metrics_data["month"][ + "books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|2024-01|Open|Regular|2023" + ] + self.assertEqual(month_item["access"], {"month": "2024-01"}) + self.assertIn("daily_metrics", month_item) + self.assertNotIn("by_day", month_item) self.assertNotIn("access_country_code", month_item) self.assertNotIn("content_language", month_item) - self.assertEqual(month_item["document_type"], "chapter") - self.assertEqual(month_item["metric_scope"], "item") - self.assertEqual(month_item["counter_data_type"], "Book_Segment") - self.assertEqual(month_item["title_pid_generic"], "BOOK:Q7GTD") + self.assertEqual(month_item["document"]["id"], "BOOK:Q7GTD/CHAPTER:03") + self.assertEqual(month_item["document"]["type"], "chapter") + self.assertEqual(month_item["document"]["title"], "Chapter Title") + self.assertEqual(month_item["document"]["parent_id"], "BOOK:Q7GTD") + self.assertEqual(month_item["document"]["publication_year"], "2023") + self.assertEqual(month_item["document"]["identifiers"]["book_id"], "q7gtd") + self.assertEqual(month_item["document"]["identifiers"]["chapter_id"], "03") + self.assertEqual(month_item["document"]["identifiers"]["isbn"], "9788578791889") + self.assertNotIn("pid_generic", month_item["document"]["identifiers"]) + self.assertEqual(month_item["counter"]["metric_scope"], "item") + self.assertEqual(month_item["counter"]["data_type"], "Book_Segment") self.assertEqual(month_item["total_requests"], 1) self.assertEqual(month_item["unique_requests"], 1) self.assertNotIn("scielo_issn", month_item["source"]) - self.assertEqual(month_item["source"]["identifiers"]["book_id"], "q7gtd") - self.assertEqual(month_item["source"]["publisher"], ["SciELO Books"]) - - month_title = metrics_data["month"]["title|books|q7gtd|||BOOK:Q7GTD|2024-01|Open|Regular|2023"] - self.assertEqual(month_title["document_type"], "book") - self.assertEqual(month_title["metric_scope"], "title") - self.assertEqual(month_title["counter_data_type"], "Book") - self.assertEqual(month_title["pid_generic"], "BOOK:Q7GTD") + self.assertNotIn("book_id", month_item["source"]["identifiers"]) + self.assertEqual(month_item["source"]["publisher_name"], ["SciELO Books"]) + + month_title = metrics_data["month"][ + "title|books|q7gtd|||BOOK:Q7GTD|2024-01|Open|Regular|2023" + ] + self.assertEqual(month_title["document"]["id"], "BOOK:Q7GTD") + self.assertEqual(month_title["document"]["type"], "book") + self.assertEqual(month_title["document"]["title"], "Book Title") + self.assertNotIn("parent_id", month_title["document"]) + self.assertEqual(month_title["counter"]["metric_scope"], "title") + self.assertEqual(month_title["counter"]["data_type"], "Book") self.assertEqual(month_title["total_requests"], 1) self.assertEqual(month_title["total_investigations"], 1) self.assertEqual(month_title["unique_requests"], 1) @@ -550,16 +617,25 @@ def test_convert_raw_results_to_index_documents_creates_month_and_year_views(sel year_item = metrics_data["year"][ "books|q7gtd|||BOOK:Q7GTD/CHAPTER:03|en|BR|2024|Open|Regular|2023" ] - self.assertEqual(year_item["access_year"], "2024") - self.assertEqual(year_item["access_country_code"], "BR") - self.assertEqual(year_item["content_language"], "en") - self.assertEqual(year_item["metric_scope"], "item") + self.assertEqual( + year_item["access"], + {"year": "2024", "country_code": "BR", "content_language": "en"}, + ) + self.assertNotIn("daily_metrics", year_item) + self.assertNotIn("by_day", year_item) + self.assertNotIn("access_month", year_item) + self.assertEqual(year_item["document"]["title"], "Chapter Title") + self.assertEqual(year_item["counter"]["metric_scope"], "item") self.assertEqual(year_item["total_requests"], 1) year_title = metrics_data["year"][ "title|books|q7gtd|||BOOK:Q7GTD|en|BR|2024|Open|Regular|2023" ] - self.assertEqual(year_title["metric_scope"], "title") + self.assertEqual(year_title["counter"]["metric_scope"], "title") + self.assertEqual(year_title["document"]["title"], "Book Title") + self.assertNotIn("daily_metrics", year_title) + self.assertNotIn("by_day", year_title) + self.assertNotIn("access_month", year_title) self.assertEqual(year_title["total_requests"], 1) self.assertEqual(year_title["total_investigations"], 1) self.assertEqual(year_title["unique_requests"], 1) @@ -615,13 +691,18 @@ def test_convert_raw_results_to_index_documents_maps_counter_data_types(self): "data|scielo-data|||10.48331/SCIELODATA.ABC123|2024-01|Open|Regular|2024" ] - self.assertEqual(preprint_doc["counter_data_type"], "Article") - self.assertEqual(preprint_doc["scielo_document_type"], "preprint") - self.assertEqual(preprint_doc["article_version"], "Preprint") - self.assertEqual(dataset_doc["counter_data_type"], "Dataset") - self.assertIsNone(dataset_doc["article_version"]) - - def test_convert_raw_results_to_index_documents_dedupes_book_unique_item_across_formats(self): + self.assertEqual(preprint_doc["counter"]["data_type"], "Article") + self.assertEqual(preprint_doc["document"]["type"], "preprint") + self.assertEqual(preprint_doc["document"]["id"], "10.1590/SCIELOPREPRINTS.1234") + self.assertNotIn("pid_generic", preprint_doc["document"].get("identifiers", {})) + self.assertNotIn("scielo_document_type", preprint_doc) + self.assertEqual(preprint_doc["counter"]["article_version"], "Preprint") + self.assertEqual(dataset_doc["counter"]["data_type"], "Dataset") + self.assertNotIn("article_version", dataset_doc["counter"]) + + def test_convert_raw_results_to_index_documents_dedupes_book_unique_item_across_formats( + self, + ): data = { "books|c2248|||BOOK:C2248/CHAPTER:03|sess|BR|pt|html|full_text": { "collection": "books", @@ -677,8 +758,12 @@ def test_convert_raw_results_to_index_documents_dedupes_book_unique_item_across_ metrics_data = index_docs.convert_raw_results_to_index_documents(data) - month_item = metrics_data["month"]["books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018"] - month_title = metrics_data["month"]["title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018"] + month_item = metrics_data["month"][ + "books|c2248|||BOOK:C2248/CHAPTER:03|2024-01|Open|Regular|2018" + ] + month_title = metrics_data["month"][ + "title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018" + ] self.assertEqual(month_item["total_requests"], 2) self.assertEqual(month_item["total_investigations"], 2) @@ -687,7 +772,9 @@ def test_convert_raw_results_to_index_documents_dedupes_book_unique_item_across_ self.assertEqual(month_title["unique_requests"], 1) self.assertEqual(month_title["unique_investigations"], 1) - def test_convert_raw_results_to_index_documents_skips_book_landing_page_from_item_scope(self): + def test_convert_raw_results_to_index_documents_skips_book_landing_page_from_item_scope( + self, + ): data = { "books|c2248|||BOOK:C2248|sess|BR|pt|html|abstract": { "collection": "books", @@ -696,6 +783,7 @@ def test_convert_raw_results_to_index_documents_skips_book_landing_page_from_ite "pid_v2": None, "pid_v3": None, "pid_generic": "BOOK:C2248", + "document": {"title": "C2248 Book"}, "title_pid_generic": "BOOK:C2248", "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", "click_timestamps": {"00:05": 1}, @@ -727,7 +815,9 @@ def test_convert_raw_results_to_index_documents_skips_book_landing_page_from_ite {"title|books|c2248|||BOOK:C2248|pt|BR|2024|Open|Regular|2018"}, ) - def test_convert_raw_results_to_index_documents_counts_whole_book_without_segments_as_book_segment(self): + def test_convert_raw_results_to_index_documents_counts_whole_book_without_segments_as_book_segment( + self, + ): data = { "books|c2248|||BOOK:C2248|sess|BR|pt|pdf|full_text": { "collection": "books", @@ -736,6 +826,7 @@ def test_convert_raw_results_to_index_documents_counts_whole_book_without_segmen "pid_v2": None, "pid_v3": None, "pid_generic": "BOOK:C2248", + "document": {"title": "C2248 Book"}, "title_pid_generic": "BOOK:C2248", "user_session_id": "browser|1.0|127.0.0.1|2024-01-15|10", "click_timestamps": {"00:05": 1}, @@ -757,13 +848,22 @@ def test_convert_raw_results_to_index_documents_counts_whole_book_without_segmen } metrics_data = index_docs.convert_raw_results_to_index_documents(data) - month_item = metrics_data["month"]["books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018"] - month_title = metrics_data["month"]["title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018"] + month_item = metrics_data["month"][ + "books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018" + ] + month_title = metrics_data["month"][ + "title|books|c2248|||BOOK:C2248|2024-01|Open|Regular|2018" + ] - self.assertEqual(month_item["counter_data_type"], "Book_Segment") - self.assertEqual(month_item["metric_scope"], "item") - self.assertEqual(month_title["counter_data_type"], "Book") - self.assertEqual(month_title["metric_scope"], "title") + self.assertEqual(month_item["counter"]["data_type"], "Book_Segment") + self.assertEqual(month_item["counter"]["metric_scope"], "item") + self.assertEqual(month_item["document"]["id"], "BOOK:C2248") + self.assertEqual(month_item["document"]["title"], "C2248 Book") + self.assertNotIn("parent_id", month_item["document"]) + self.assertEqual(month_title["counter"]["data_type"], "Book") + self.assertEqual(month_title["counter"]["metric_scope"], "title") + self.assertEqual(month_title["document"]["id"], "BOOK:C2248") + self.assertEqual(month_title["document"]["title"], "C2248 Book") def test_convert_raw_results_aggregates_multiple_chapters_correctly(self): """Test that accessing multiple chapters creates correct title-level totals""" @@ -819,17 +919,23 @@ def test_convert_raw_results_aggregates_multiple_chapters_correctly(self): self.assertEqual(len(metrics_data["year"]), 3) # 2 items + 1 title # Each item should have total=1, unique=1 - month_item_1 = metrics_data["month"]["books|q7gtd|||BOOK:Q7GTD/CHAPTER:01|2024-01|Open|Regular|2023"] + month_item_1 = metrics_data["month"][ + "books|q7gtd|||BOOK:Q7GTD/CHAPTER:01|2024-01|Open|Regular|2023" + ] self.assertEqual(month_item_1["total_requests"], 1) self.assertEqual(month_item_1["unique_requests"], 1) - month_item_2 = metrics_data["month"]["books|q7gtd|||BOOK:Q7GTD/CHAPTER:02|2024-01|Open|Regular|2023"] + month_item_2 = metrics_data["month"][ + "books|q7gtd|||BOOK:Q7GTD/CHAPTER:02|2024-01|Open|Regular|2023" + ] self.assertEqual(month_item_2["total_requests"], 1) self.assertEqual(month_item_2["unique_requests"], 1) # Title should have total=2 (sum of both chapters) # Title unique should be 1 (same session accessed book, counted once) - month_title = metrics_data["month"]["title|books|q7gtd|||BOOK:Q7GTD|2024-01|Open|Regular|2023"] + month_title = metrics_data["month"][ + "title|books|q7gtd|||BOOK:Q7GTD|2024-01|Open|Regular|2023" + ] self.assertEqual(month_title["total_requests"], 2) self.assertEqual(month_title["total_investigations"], 2) self.assertEqual(month_title["unique_requests"], 1) diff --git a/metrics/tests/test_opensearch.py b/metrics/tests/test_opensearch.py index 80586f9..49e21b3 100644 --- a/metrics/tests/test_opensearch.py +++ b/metrics/tests/test_opensearch.py @@ -52,10 +52,41 @@ def test_get_index_mappings_returns_books_specific_mappings(self): opensearch.get_index_mappings("books", "year"), opensearch.BOOKS_YEAR_INDEX_MAPPINGS, ) - self.assertIn("metric_scope", opensearch.BOOKS_MONTH_INDEX_MAPPINGS["properties"]) - self.assertIn("counter_data_type", opensearch.BOOKS_YEAR_INDEX_MAPPINGS["properties"]) - self.assertIn("title_pid_generic", opensearch.BOOKS_YEAR_INDEX_MAPPINGS["properties"]) - self.assertIn("applied_jobs", opensearch.BOOKS_MONTH_INDEX_MAPPINGS["properties"]) + self.assertIn("counter", opensearch.BOOKS_MONTH_INDEX_MAPPINGS["properties"]) + self.assertIn("access", opensearch.BOOKS_YEAR_INDEX_MAPPINGS["properties"]) + self.assertIn( + "applied_jobs", opensearch.BOOKS_MONTH_INDEX_MAPPINGS["properties"] + ) + for mappings in ( + opensearch.MONTH_INDEX_MAPPINGS, + opensearch.YEAR_INDEX_MAPPINGS, + opensearch.BOOKS_MONTH_INDEX_MAPPINGS, + opensearch.BOOKS_YEAR_INDEX_MAPPINGS, + ): + for removed_field in ( + "document_type", + "scielo_document_type", + "pid", + "pid_v2", + "pid_v3", + "pid_generic", + "title_pid_generic", + "counter_data_type", + "access_month", + "access_year", + ): + self.assertNotIn(removed_field, mappings["properties"]) + document_mapping = mappings["properties"]["document"] + self.assertEqual(document_mapping["properties"]["id"]["type"], "keyword") + self.assertEqual(document_mapping["properties"]["title"]["type"], "text") + self.assertEqual( + document_mapping["properties"]["title"]["fields"]["keyword"]["type"], + "keyword", + ) + self.assertEqual( + mappings["properties"]["source"]["properties"]["id"]["type"], + "keyword", + ) @patch("metrics.opensearch.client.helpers.bulk") @patch.object(opensearch.OpenSearchUsageClient, "get_opensearch_client") @@ -72,9 +103,8 @@ def test_increment_documents_for_daily_job_uses_applied_jobs( documents={ "doc-1": { "collection": "books", - "pid": "BOOK:WD", - "pid_generic": "BOOK:WD", - "access_date": "2025-06-03", + "document": {"id": "BOOK:WD"}, + "access": {"month": "2025-06"}, "total_requests": 3, "total_investigations": 4, "unique_requests": 2, @@ -88,5 +118,7 @@ def test_increment_documents_for_daily_job_uses_applied_jobs( self.assertEqual(len(actions), 1) action = actions[0] self.assertEqual(action["_op_type"], "update") - self.assertEqual(action["script"]["params"]["job_id"], "books|2025-06-03|abc123") + self.assertEqual( + action["script"]["params"]["job_id"], "books|2025-06-03|abc123" + ) self.assertEqual(action["upsert"], {"applied_jobs": []}) From 32d0fbaf0157627ccd6919653e2b8e80c4843227 Mon Sep 17 00:00:00 2001 From: Rafael JPD Date: Wed, 6 May 2026 08:28:34 -0300 Subject: [PATCH 3/3] Adapta exportacao mensal R5.1 de livros --- .../export_book_r51_monthly_metrics.py | 58 ++++++++++++------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/metrics/management/commands/export_book_r51_monthly_metrics.py b/metrics/management/commands/export_book_r51_monthly_metrics.py index 9889387..1d78df0 100644 --- a/metrics/management/commands/export_book_r51_monthly_metrics.py +++ b/metrics/management/commands/export_book_r51_monthly_metrics.py @@ -5,13 +5,14 @@ from device_detector import DeviceDetector from django.core.management.base import BaseCommand, CommandError +from scielo_usage_counter.translator.books import URLTranslatorBooksSite from collection.models import Collection from document.models import Document -from metrics.counter import access, documents as index_docs +from metrics.counter import access +from metrics.counter import documents as index_docs from resources.models import MMDB, RobotUserAgent from scielo_usage_counter import log_handler, url_translator -from scielo_usage_counter.translator.books import URLTranslatorBooksSite from source.models import Source @@ -160,7 +161,9 @@ def handle(self, *args, **options): self.stdout.write(self.style.SUCCESS(f"Item CSV written to {item_output}")) self.stdout.write(self.style.SUCCESS(f"Title CSV written to {title_output}")) if summary_output: - self.stdout.write(self.style.SUCCESS(f"Summary JSON written to {summary_output}")) + self.stdout.write( + self.style.SUCCESS(f"Summary JSON written to {summary_output}") + ) def _parse_file(self, path, parser, utm, collection, ua_cache, results): stats = defaultdict(int) @@ -213,11 +216,13 @@ def _parse_file(self, path, parser, utm, collection, ua_cache, results): if is_bunny: local_datetime = parser.format_date(data.get("unix_ts"), None) - country_code = data.get("country") or parser.geoip.ip_to_country_code( - ip_address - ) + country_code = data.get( + "country" + ) or parser.geoip.ip_to_country_code(ip_address) else: - local_datetime = parser.format_date(data.get("date"), data.get("timezone")) + local_datetime = parser.format_date( + data.get("date"), data.get("timezone") + ) country_code = parser.geoip.ip_to_country_code(ip_address) if not local_datetime: @@ -295,20 +300,23 @@ def _build_monthly_documents(self, results): title_documents = {} for doc in documents["month"].values(): - year_month = doc.get("access_month", "") - scope = doc.get("metric_scope", "item") + access = doc.get("access") or {} + counter = doc.get("counter") or {} + document = doc.get("document") or {} + year_month = access.get("month", "") + scope = counter.get("metric_scope", "item") if scope == "title": + title_id = document.get("id") key = ( year_month, - doc.get("title_pid_generic") or doc.get("pid_generic"), - doc.get("document_type"), + title_id, + document.get("type"), ) if key not in title_documents: title_documents[key] = { "year_month": year_month, - "title_pid_generic": doc.get("title_pid_generic") - or doc.get("pid_generic"), - "document_type": doc.get("document_type"), + "title_pid_generic": title_id, + "document_type": document.get("type"), "total_requests": 0, "total_investigations": 0, "unique_requests": 0, @@ -324,18 +332,20 @@ def _build_monthly_documents(self, results): ) continue + item_id = document.get("id") + title_id = document.get("parent_id") or item_id key = ( year_month, - doc.get("title_pid_generic"), - doc.get("pid_generic"), - doc.get("document_type"), + title_id, + item_id, + document.get("type"), ) if key not in item_documents: item_documents[key] = { "year_month": year_month, - "title_pid_generic": doc.get("title_pid_generic"), - "segment_pid_generic": doc.get("pid_generic"), - "document_type": doc.get("document_type"), + "title_pid_generic": title_id, + "segment_pid_generic": item_id, + "document_type": document.get("type"), "total_requests": 0, "total_investigations": 0, "unique_requests": 0, @@ -390,7 +400,9 @@ def _write_item_csv(path, item_documents): "total_item_requests": doc.get("total_requests", 0), "total_item_investigations": doc.get("total_investigations", 0), "unique_item_requests": doc.get("unique_requests", 0), - "unique_item_investigations": doc.get("unique_investigations", 0), + "unique_item_investigations": doc.get( + "unique_investigations", 0 + ), } ) @@ -426,6 +438,8 @@ def _write_title_csv(path, title_documents): "total_item_requests": doc.get("total_requests", 0), "total_item_investigations": doc.get("total_investigations", 0), "unique_title_requests": doc.get("unique_requests", 0), - "unique_title_investigations": doc.get("unique_investigations", 0), + "unique_title_investigations": doc.get( + "unique_investigations", 0 + ), } )