From 0316c7b403a9c25295c9e734babef6d6b21b0a57 Mon Sep 17 00:00:00 2001 From: pidoubleyou <22942659+pidoubleyou@users.noreply.github.com> Date: Thu, 7 May 2026 22:35:46 +0200 Subject: [PATCH 01/10] erster Zwischenstand --- MServer-Config.yaml | 3 +- pom.xml | 6 + .../mserver/crawler/CrawlerManager.java | 3 + .../tagesschau/TagesschauConstants.java | 27 + .../crawler/tagesschau/TagesschauCrawler.java | 92 + .../json/TagesschauVideoDeserializer.java | 133 + .../tasks/TagesschauEnriesTask.java | 70 + .../tagesschau/tasks/TagesschauFilmTask.java | 238 + .../tasks/TagesschauOverviewTask.java | 80 + .../tagesschau/tasks/TagesschauVideoTask.java | 56 + .../json/TagesschauVideoDeserializerTest.java | 60 + .../tasks/TagesschauEntriesTaskTest.java | 59 + .../tasks/TagesschauOverviewTaskTest.java | 96 + .../tasks/TagesschauTaskTestBase.java | 22 + .../tasks/TagesschauVideoTaskTest.java | 64 + .../mserver/testhelper/JsoupMock.java | 4 +- .../mserver/testhelper/WireMockTestBase.java | 8 + .../tagesschau/tagesschau_20jahre_month.html | 6080 ++++++++++++++ .../tagesschau_20jahre_overview.html | 7356 +++++++++++++++++ .../tagesschau/tagesschau_20jahre_video.html | 5288 ++++++++++++ .../tagesschau/tagesschau_20jahre_video.json | 248 + .../tagesschau/tagesschau_20jahre_year.html | 6303 ++++++++++++++ 22 files changed, 26293 insertions(+), 3 deletions(-) create mode 100644 src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauConstants.java create mode 100644 src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java create mode 100644 src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java create mode 100644 src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEnriesTask.java create mode 100644 src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauFilmTask.java create mode 100644 
src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTask.java create mode 100644 src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTask.java create mode 100644 src/test/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializerTest.java create mode 100644 src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTaskTest.java create mode 100644 src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTaskTest.java create mode 100644 src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauTaskTestBase.java create mode 100644 src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTaskTest.java create mode 100644 src/test/resources/tagesschau/tagesschau_20jahre_month.html create mode 100644 src/test/resources/tagesschau/tagesschau_20jahre_overview.html create mode 100644 src/test/resources/tagesschau/tagesschau_20jahre_video.html create mode 100644 src/test/resources/tagesschau/tagesschau_20jahre_video.json create mode 100644 src/test/resources/tagesschau/tagesschau_20jahre_year.html diff --git a/MServer-Config.yaml b/MServer-Config.yaml index d4684761d..f340cc696 100644 --- a/MServer-Config.yaml +++ b/MServer-Config.yaml @@ -27,7 +27,7 @@ maximumRequestsPerSecond: 999.0 # If set only these Sender will be crawled all other will be ignored. 
senderIncluded: - - ARD + # - ARD #- ARTE_DE #- ARTE_FR #- ARTE_PL @@ -41,6 +41,7 @@ senderIncluded: #- PHOENIX #- SRF #- SR + - TAGESSCHAU24 #- ZDF #SRF,SR,PHONIX,ORF,KIKA,DW,3SAT< diff --git a/pom.xml b/pom.xml index c56c407ac..a1db8df7c 100644 --- a/pom.xml +++ b/pom.xml @@ -279,6 +279,12 @@ ${mockito.version} test + + org.mockito + mockito-junit-jupiter + ${mockito.version} + test + org.hamcrest hamcrest diff --git a/src/main/java/de/mediathekview/mserver/crawler/CrawlerManager.java b/src/main/java/de/mediathekview/mserver/crawler/CrawlerManager.java index 194d469ad..b4f881b6a 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/CrawlerManager.java +++ b/src/main/java/de/mediathekview/mserver/crawler/CrawlerManager.java @@ -1,5 +1,6 @@ package de.mediathekview.mserver.crawler; +import de.mediathekview.mserver.crawler.tagesschau.TagesschauCrawler; import de.mediathekview.mserver.daten.Film; import de.mediathekview.mserver.daten.Filmlist; import de.mediathekview.mserver.daten.Sender; @@ -547,6 +548,8 @@ private void initializeCrawler(final MServerConfigManager rootConfig) { Sender.SRF, new SrfCrawler(forkJoinPool, messageListeners, progressListeners, rootConfig)); crawlerMap.put( Sender.SR, new SrCrawler(forkJoinPool, messageListeners, progressListeners, rootConfig)); + crawlerMap.put( + Sender.TAGESSCHAU24, new TagesschauCrawler(forkJoinPool, messageListeners, progressListeners, rootConfig)); crawlerMap.put( Sender.ZDF, new ZdfCrawler(forkJoinPool, messageListeners, progressListeners, rootConfig)); } diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauConstants.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauConstants.java new file mode 100644 index 000000000..9bd3f3b0c --- /dev/null +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauConstants.java @@ -0,0 +1,27 @@ +package de.mediathekview.mserver.crawler.tagesschau; + +/** + * Constants for the Tagesschau crawler. 
+ * Handles the "vor 20 Jahren" (20 years ago) archive with daily news broadcasts.
+ */
+public final class TagesschauConstants {
+
+  // Starting point: Tagesschau vor 20 Jahren (20 years ago)
+  public static final String ARCHIVE_START_URL = "https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-142.html";
+
+  // Format pattern for the archive page of a single day; %s is the date as yyyyMMdd
+  // Example: /multimedia/sendung/ts/vor20jahren/ts-vor20jahren-20060401.html
+  public static final String ARCHIVE_DAY_URL_PATTERN = "https://www.tagesschau.de/multimedia/sendung/ts/vor20jahren/ts-vor20jahren-%s.html";
+
+  // Base URL for archive pages
+  public static final String ARCHIVE_MONTH_BASE = "https://www.tagesschau.de/multimedia/sendung/ts/vor20jahren/";
+
+  // NOTE(review): contains literal percent-escapes (%3A, %2F) next to %s, so it cannot be passed
+  // to String.format as-is; insert the video id with String.replace or escape them as %%.
+  public static final String VIDEO_JSON = "https://zagent7.h-cdn.com/cmd/get_links_info?customer=ard_de&zone=gen&ver=1.165.211&url=https%3A%2F%2Fwww.tagesschau.de%2Fmultimedia%2Fsendung%2Ftagesschau_vor_20_jahren%2Fvideo-%s.html";
+
+  // Private constructor to hide the implicit public one
+  private TagesschauConstants() {
+    // Utility class, do not instantiate
+  }
+}
diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java
new file mode 100644
index 000000000..d58b44b17
--- /dev/null
+++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java
@@ -0,0 +1,85 @@
+package de.mediathekview.mserver.crawler.tagesschau;
+
+import de.mediathekview.mserver.daten.Film;
+import de.mediathekview.mserver.daten.Sender;
+import de.mediathekview.mserver.base.messages.listener.MessageListener;
+import de.mediathekview.mserver.base.config.MServerConfigManager;
+import de.mediathekview.mserver.base.messages.ServerMessages;
+import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
+import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
+import de.mediathekview.mserver.crawler.tagesschau.tasks.TagesschauFilmTask;
+import de.mediathekview.mserver.progress.listeners.SenderProgressListener;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.util.Collection;
+import java.util.Queue;
+import java.util.Set;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import java.util.concurrent.ForkJoinPool;
+import java.util.concurrent.RecursiveTask;
+
+/**
+ * Crawler for the Tagesschau "vor 20 Jahren" (20 years ago) archive.
+ *
+ * <p>Seeds a {@link TagesschauFilmTask} with the archive start page; that task creates the films.
+ */
+public class TagesschauCrawler extends AbstractCrawler {
+
+  private static final Logger LOG = LogManager.getLogger(TagesschauCrawler.class);
+
+  public TagesschauCrawler(
+      final ForkJoinPool aForkJoinPool,
+      final Collection<MessageListener> aMessageListeners,
+      final Collection<SenderProgressListener> aProgressListeners,
+      final MServerConfigManager rootConfig) {
+    super(aForkJoinPool, aMessageListeners, aProgressListeners, rootConfig);
+  }
+
+  @Override
+  public Sender getSender() {
+    return Sender.TAGESSCHAU24;
+  }
+
+  /**
+   * Creates the root task of this crawler.
+   *
+   * @return the film task for the start URLs, or {@code null} if there are none or an error occurred
+   */
+  @Override
+  protected RecursiveTask<Set<Film>> createCrawlerTask() {
+    try {
+      final Queue<CrawlerUrlDTO> filmUrls = createFilmUrls();
+
+      if (filmUrls.isEmpty()) {
+        LOG.warn("No URLs created for Tagesschau crawler");
+        return null;
+      }
+
+      // Set max count for progress tracking
+      getAndSetMaxCount(filmUrls.size());
+
+      printMessage(
+          ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT,
+          getSender().getName(),
+          filmUrls.size());
+
+      return new TagesschauFilmTask(this, filmUrls);
+
+    } catch (final Exception ex) {
+      LOG.fatal("Exception in Tagesschau crawler.", ex);
+      printErrorMessage();
+    }
+    return null;
+  }
+
+  /**
+   * Creates the queue of start URLs: currently only the main archive page, which contains the
+   * links to the individual days.
+   */
+  private Queue<CrawlerUrlDTO> createFilmUrls() {
+    final Queue<CrawlerUrlDTO> urls = new ConcurrentLinkedQueue<>();
+    urls.add(new CrawlerUrlDTO(TagesschauConstants.ARCHIVE_START_URL));
+    return urls;
+  }
+}
diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java
new file mode 100644
index 000000000..c473eb59f
--- /dev/null
+++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java
@@ -0,0 +1,133 @@
+package de.mediathekview.mserver.crawler.tagesschau.json;
+
+import com.google.gson.JsonDeserializationContext;
+import com.google.gson.JsonDeserializer;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import de.mediathekview.mserver.base.utils.JsonUtils;
+import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
+import de.mediathekview.mserver.daten.*;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.glassfish.jersey.message.internal.Quality; // NOTE(review): not referenced in this class
+
+import java.lang.reflect.Type;
+import java.net.MalformedURLException;
+import java.time.Duration;
+import java.time.LocalDateTime;
+import java.time.OffsetDateTime;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+import java.time.format.DateTimeParseException;
+import java.util.*;
+
+public class TagesschauVideoDeserializer implements JsonDeserializer<List<Film>> {
+  private static final String ELEMENT_MC = "mc";
+  private static final String ELEMENT_MEDIA = "media";
+  private static final String ELEMENT_META = "meta";
+  private static final String ELEMENT_STREAMS = "streams";
+
+  private static final String ATTRIBUTE_DATE = "broadcastedOnDateTime";
+  private static
final String ATTRIBUTE_DURATION = "durationSeconds";
+  private static final String ATTRIBUTE_TOPIC = "seriesTitle";
+  private static final String ATTRIBUTE_TITLE = "title";
+
+  private static final String ATTRIBUTE_WIDTH = "maxHResolutionPx";
+  private static final String ATTRIBUTE_MIMETYPE = "mimeType";
+  private static final String ATTRIBUTE_URL = "url";
+  private static final Set<String> SUPPORTED_MIME_TYPES = Set.of("video/mp4");
+
+  // NOTE(review): the sample value in the trailing comment has a colon in the offset ("+02:00"),
+  // while the pattern letter "Z" stands for offsets without a colon ("+0200") - confirm against
+  // the real data and switch to "XXX" / ISO_OFFSET_DATE_TIME if the sample is accurate.
+  private static final DateTimeFormatter DATE_TIME_FORMATTER =
+      DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ssZ", Locale.GERMANY); // 2016-10-29T16:15:00+02:00
+  private static final String GERMAN_TIME_ZONE = "Europe/Berlin";
+  private static final Logger LOG = LogManager.getLogger(TagesschauVideoDeserializer.class);
+  private final AbstractCrawler crawler;
+
+  public TagesschauVideoDeserializer(AbstractCrawler crawler) {
+    this.crawler = crawler;
+  }
+
+  /**
+   * Creates a film from the {@code mc.meta} and {@code mc.streams} parts of the given JSON.
+   *
+   * @return a list with the created film, or an empty list if {@code mc}, {@code mc.meta} or a
+   *     parsable broadcast date is missing
+   */
+  @Override
+  public List<Film> deserialize(
+      JsonElement jsonElement, Type type, JsonDeserializationContext jsonDeserializationContext) {
+
+    final List<Film> results = new ArrayList<>();
+
+    final Optional<JsonElement> mcElement = JsonUtils.getElement(jsonElement, ELEMENT_MC);
+    if (mcElement.isEmpty()) {
+      return results;
+    }
+    final Optional<JsonElement> metaElement = JsonUtils.getElement(mcElement.get(), ELEMENT_META);
+    if (metaElement.isEmpty()) {
+      return results;
+    }
+
+    final JsonObject metaObject = metaElement.get().getAsJsonObject();
+    final Optional<String> topic = JsonUtils.getAttributeAsString(metaObject, ATTRIBUTE_TOPIC);
+    final Optional<String> title = JsonUtils.getAttributeAsString(metaObject, ATTRIBUTE_TITLE);
+    final Optional<Integer> duration = JsonUtils.getAttributeAsInt(metaObject, ATTRIBUTE_DURATION);
+    final Optional<LocalDateTime> date = parseDate(metaObject);
+    if (date.isEmpty()) {
+      // the film is created with its broadcast time, so an entry without one is skipped
+      LOG.error("Skipping Tagesschau video without a parsable {} attribute", ATTRIBUTE_DATE);
+      return results;
+    }
+    final Map<Resolution, String> urls = parseUrls(mcElement.get().getAsJsonObject());
+
+    // TODO validate topic and title
+    // TODO time zone does not match yet
+
+    final Film film =
+        new Film(
+            UUID.randomUUID(),
+            Sender.TAGESSCHAU24,
+            title.orElse(""),
+            topic.orElse(""),
+            date.get(),
+            duration.map(seconds -> Duration.ofSeconds(seconds)).orElse(Duration.ZERO));
+    film.addGeolocation(GeoLocations.GEO_NONE);
+
+    for (final Map.Entry<Resolution, String> entry : urls.entrySet()) {
+      final String url = entry.getValue();
+      try {
+        film.addUrl(entry.getKey(), new FilmUrl(url, crawler.determineFileSizeInKB(url)));
+      } catch (final MalformedURLException ex) {
+        LOG.error("InvalidUrl: {}", url, ex);
+      }
+    }
+
+    results.add(film);
+    return results;
+  }
+
+  /**
+   * Collects the URLs of all supported media entries of all streams, keyed by resolution.
+   * Missing or unexpectedly typed {@code streams} / {@code media} elements are skipped.
+   */
+  private Map<Resolution, String> parseUrls(final JsonObject mcObject) {
+    final Map<Resolution, String> urls = new EnumMap<>(Resolution.class);
+
+    final JsonElement streams = mcObject.get(ELEMENT_STREAMS);
+    if (streams == null || !streams.isJsonArray()) {
+      return urls;
+    }
+
+    for (final JsonElement stream : streams.getAsJsonArray()) {
+      if (!stream.isJsonObject()) {
+        continue;
+      }
+      final JsonElement mediaList = stream.getAsJsonObject().get(ELEMENT_MEDIA);
+      if (mediaList == null || !mediaList.isJsonArray()) {
+        continue;
+      }
+      for (final JsonElement media : mediaList.getAsJsonArray()) {
+        if (media.isJsonObject()) {
+          addMediaUrl(media, urls);
+        }
+      }
+    }
+
+    return urls;
+  }
+
+  /** Adds the URL of one media entry if it has a supported mime type, a width and a URL. */
+  private static void addMediaUrl(final JsonElement media, final Map<Resolution, String> urls) {
+    final Optional<String> mimeType = JsonUtils.getElementValueAsString(media, ATTRIBUTE_MIMETYPE);
+    if (mimeType.isEmpty() || !SUPPORTED_MIME_TYPES.contains(mimeType.get())) {
+      return;
+    }
+    final Optional<Integer> width = JsonUtils.getAttributeAsInt(media.getAsJsonObject(), ATTRIBUTE_WIDTH);
+    final Optional<String> url = JsonUtils.getElementValueAsString(media, ATTRIBUTE_URL);
+
+    if (width.isPresent() && url.isPresent()) {
+      urls.put(Resolution.getResolutionFromWidth(width.get()), url.get());
+    }
+  }
+
+  /**
+   * Reads the broadcast date and converts it to local time in the Europe/Berlin zone.
+   *
+   * @return the local date time, or empty if the attribute is missing or cannot be parsed
+   */
+  private static Optional<LocalDateTime> parseDate(final JsonObject metaObject) {
+    final Optional<String> dateValue =
+        JsonUtils.getAttributeAsString(metaObject, ATTRIBUTE_DATE);
+    if (dateValue.isPresent()) {
+      try {
+        final OffsetDateTime inputDateTime = OffsetDateTime.parse(dateValue.get(), DATE_TIME_FORMATTER);
+        final LocalDateTime localDateTime = inputDateTime.atZoneSameInstant(ZoneId.of(GERMAN_TIME_ZONE)).toLocalDateTime();
+        return Optional.of(localDateTime);
+      } catch (final DateTimeParseException ex) {
+        LOG.error("Error parsing date time value {}", dateValue.get(), ex);
+      }
+    }
+
+    return Optional.empty();
+  }
+
+}
diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEnriesTask.java
b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEnriesTask.java
new file mode 100644
index 000000000..2e24ce114
--- /dev/null
+++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEnriesTask.java
@@ -0,0 +1,82 @@
+package de.mediathekview.mserver.crawler.tagesschau.tasks;
+
+import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
+import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask;
+import de.mediathekview.mserver.crawler.basic.AbstractRecursiveConverterTask;
+import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
+import de.mediathekview.mserver.crawler.tagesschau.TagesschauConstants;
+
+import java.util.Arrays;
+import java.util.Queue;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+/**
+ * Reads an archive page and collects the links to the individual video pages.
+ *
+ * <p>Only teaser links ({@code .teaser-absatz__link}) whose target ends with
+ * {@code /video-<id>.html} are collected.
+ */
+public class TagesschauEnriesTask extends AbstractDocumentTask<CrawlerUrlDTO, CrawlerUrlDTO> {
+  private static final Logger LOG = LogManager.getLogger(TagesschauEnriesTask.class);
+
+  private static final String BASE_URL = "https://www.tagesschau.de";
+
+  /** Matches URLs of video pages, e.g. {@code .../video-1547694.html}. */
+  private static final Pattern VIDEO_PAGE_PATTERN = Pattern.compile(".*/video-\\d+\\.html$");
+
+  private static final String[] BLACKLIST = new String[] {TagesschauConstants.ARCHIVE_START_URL};
+
+  public TagesschauEnriesTask(final AbstractCrawler crawler, final Queue<CrawlerUrlDTO> queue) {
+    super(crawler, queue);
+  }
+
+  @Override
+  protected void processDocument(CrawlerUrlDTO aUrlDTO, Document aDocument) {
+    LOG.debug("Processing Tagesschau entries page: {}", aUrlDTO.getUrl());
+
+    final Elements links = aDocument.select(".teaser-absatz__link");
+
+    for (final Element link : links) {
+      try {
+        final String href = link.attr("href");
+        if (href.isEmpty()) {
+          continue;
+        }
+        final String fullUrl = toAbsoluteUrl(href);
+
+        final Matcher m = VIDEO_PAGE_PATTERN.matcher(fullUrl);
+        if (!m.find() || Arrays.stream(BLACKLIST).anyMatch(fullUrl::equalsIgnoreCase)) {
+          continue;
+        }
+        // taskResults is a Set, so a page that is linked twice is added and counted only once
+        if (taskResults.add(new CrawlerUrlDTO(fullUrl))) {
+          crawler.incrementAndGetActualCount();
+        }
+      } catch (final Exception e) {
+        LOG.debug("Error while processing entries link", e);
+        crawler.incrementAndGetErrorCount();
+      }
+    }
+  }
+
+  /** Returns absolute links unchanged and prefixes all other links with the site's base URL. */
+  private static String toAbsoluteUrl(final String href) {
+    if (href.startsWith("http")) {
+      return href;
+    }
+    return BASE_URL + (href.startsWith("/") ? "" : "/") + href;
+  }
+
+  @Override
+  protected AbstractRecursiveConverterTask<CrawlerUrlDTO, CrawlerUrlDTO> createNewOwnInstance(
+      Queue<CrawlerUrlDTO> aElementsToProcess) {
+    return new TagesschauEnriesTask(crawler, aElementsToProcess);
+  }
+}
diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauFilmTask.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauFilmTask.java
new file mode 100644
index 000000000..3fcf21a3e
--- /dev/null
+++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauFilmTask.java
@@ -0,0 +1,238 @@
+package de.mediathekview.mserver.crawler.tagesschau.tasks;
+
+import de.mediathekview.mserver.daten.Film;
+import de.mediathekview.mserver.daten.FilmUrl;
+import de.mediathekview.mserver.daten.Resolution;
+import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
+import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask;
+import
de.mediathekview.mserver.crawler.basic.AbstractRecursiveConverterTask;
+import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.net.MalformedURLException;
+import java.time.LocalDate;
+import java.time.format.DateTimeFormatter;
+import java.util.Queue;
+import java.util.UUID;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Task for processing Tagesschau archive pages.
+ *
+ * <p>Creates one {@link Film} per teaser link that points into the "vor20jahren" archive. If a
+ * page yields no film that way, the video elements embedded in the page are tried instead.
+ */
+public class TagesschauFilmTask extends AbstractDocumentTask<Film, CrawlerUrlDTO> {
+
+  private static final Logger LOG = LogManager.getLogger(TagesschauFilmTask.class);
+  private static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyyMMdd");
+  private static final String BASE_URL = "https://www.tagesschau.de";
+  private static final String TOPIC = "Tagesschau vor 20 Jahren";
+
+  /** Captures the yyyyMMdd part of URLs like {@code ts-vor20jahren-20060401.html}. */
+  private static final Pattern URL_DATE_PATTERN = Pattern.compile("vor20jahren-(\\d{8})");
+
+  public TagesschauFilmTask(
+      final AbstractCrawler aCrawler,
+      final Queue<CrawlerUrlDTO> aUrls) {
+    super(aCrawler, aUrls);
+  }
+
+  @Override
+  protected void processDocument(
+      final CrawlerUrlDTO aUrlDTO,
+      final Document aDocument) {
+    try {
+      LOG.debug("Processing Tagesschau archive page: {}", aUrlDTO.getUrl());
+
+      // The decision is made per page: taskResults belongs to the task and can presumably
+      // already contain films of other pages.
+      final int teaserFilms = processTeaserLinks(aDocument);
+      if (teaserFilms > 0) {
+        LOG.debug("Found {} films on archive page", teaserFilms);
+        return;
+      }
+
+      processVideoElements(aDocument);
+
+    } catch (final Exception e) {
+      crawler.incrementAndGetErrorCount();
+      LOG.error("Error processing document: {}", aUrlDTO.getUrl(), e);
+    }
+  }
+
+  /**
+   * Creates films from the teaser links of the page. Teasers without a headline are ignored.
+   *
+   * @return the number of films created from this page
+   */
+  private int processTeaserLinks(final Document aDocument) {
+    final Elements teaserLinks = aDocument.select("a[href*='/multimedia/sendung/ts/vor20jahren']");
+    int created = 0;
+
+    for (final Element link : teaserLinks) {
+      try {
+        final String href = link.attr("href");
+        final String title = link.select(".teaser-absatz__headline").text();
+        final String description = link.select(".teaser-absatz__shorttext").text();
+        if (title.isEmpty()) {
+          continue;
+        }
+
+        final Film film =
+            createFilmFromTeaser(title, description, extractDateFromUrl(href), toAbsoluteUrl(href));
+        if (film != null) {
+          taskResults.add(film);
+          crawler.incrementAndGetActualCount();
+          created++;
+        }
+      } catch (final Exception e) {
+        LOG.debug("Error parsing teaser link", e);
+        crawler.incrementAndGetErrorCount();
+      }
+    }
+    return created;
+  }
+
+  /** Creates films from the video elements of the page; films without a URL are dropped. */
+  private void processVideoElements(final Document aDocument) {
+    final Elements videoElements = aDocument.select("[data-js_component='video'], video, .video");
+
+    for (final Element videoElem : videoElements) {
+      try {
+        final Film film = parseVideoElement(videoElem);
+        if (film != null && !film.getUrls().isEmpty()) {
+          taskResults.add(film);
+          crawler.incrementAndGetActualCount();
+        }
+      } catch (final Exception e) {
+        LOG.debug("Error parsing video element", e);
+        crawler.incrementAndGetErrorCount();
+      }
+    }
+  }
+
+  /**
+   * Creates a Film object from teaser information.
+   *
+   * @param dateStr the broadcast date as yyyyMMdd, or {@code null} if unknown
+   * @return the film, or {@code null} if it could not be created
+   */
+  private Film createFilmFromTeaser(String title, String description, String dateStr, String url) {
+    try {
+      final Film film =
+          new Film(
+              UUID.randomUUID(),
+              crawler.getSender(),
+              title.trim(),
+              TOPIC,
+              dateStr != null ? LocalDate.parse(dateStr, DATE_FORMAT).atStartOfDay() : null,
+              null);
+
+      if (!description.isEmpty()) {
+        film.setBeschreibung(description.trim());
+      }
+
+      // NOTE(review): this stores the teaser link target (an archive page, not a media file) as
+      // the HD video URL with size 0 - confirm that this is intended.
+      try {
+        film.addUrl(Resolution.HD, new FilmUrl(url, 0L));
+      } catch (final MalformedURLException e) {
+        LOG.warn("Invalid URL: {}", url, e);
+        return null;
+      }
+
+      return film;
+
+    } catch (final Exception e) {
+      LOG.debug("Error creating film from teaser", e);
+      return null;
+    }
+  }
+
+  /**
+   * Parses a video element to extract Film information.
+   *
+   * @return the film (possibly without a URL), or {@code null} if it could not be created
+   */
+  private Film parseVideoElement(Element videoElem) {
+    try {
+      final Element titleElem = videoElem.selectFirst(".video-title, .headline, h3, h2");
+      String title = titleElem != null ? titleElem.text() : "";
+      if (title.isEmpty()) {
+        title = "Tagesschau Archiv";
+      }
+
+      final Film film = new Film(UUID.randomUUID(), crawler.getSender(), title, TOPIC, null, null);
+
+      final Element descElem = videoElem.selectFirst(".description, .shorttext, p");
+      if (descElem != null && !descElem.text().isEmpty()) {
+        film.setBeschreibung(descElem.text());
+      }
+
+      final String url = findVideoUrl(videoElem);
+      if (!url.isEmpty()) {
+        final String fullUrl = toAbsoluteUrl(url);
+        try {
+          film.addUrl(Resolution.HD, new FilmUrl(fullUrl, 0L));
+        } catch (final MalformedURLException e) {
+          LOG.warn("Invalid URL: {}", fullUrl, e);
+          return null;
+        }
+      }
+
+      return film;
+
+    } catch (final Exception e) {
+      LOG.debug("Error parsing video element", e);
+      return null;
+    }
+  }
+
+  /**
+   * Looks up the URL of a video element: first its {@code data-href} attribute, then the first
+   * contained link, then the first contained {@code source}. Returns an empty string if none
+   * is found.
+   */
+  private static String findVideoUrl(final Element videoElem) {
+    String url = videoElem.attr("data-href");
+    if (url.isEmpty()) {
+      final Element linkElem = videoElem.selectFirst("a[href]");
+      if (linkElem != null) {
+        url = linkElem.attr("href");
+      }
+    }
+    if (url.isEmpty()) {
+      final Element sourceElem = videoElem.selectFirst("source");
+      if (sourceElem != null) {
+        url = sourceElem.attr("src");
+      }
+    }
+    return url;
+  }
+
+  /** Returns absolute links unchanged and prefixes all other links with the site's base URL. */
+  private static String toAbsoluteUrl(final String href) {
+    if (href.startsWith("http")) {
+      return href;
+    }
+    return BASE_URL + (href.startsWith("/") ? "" : "/") + href;
+  }
+
+  /**
+   * Extracts date from URL in format yyyyMMdd.
+   * Example: ts-vor20jahren-20060401 -> 20060401
+   *
+   * @return the eight digits following "vor20jahren-", or {@code null} if the URL has none
+   */
+  private String extractDateFromUrl(String url) {
+    final Matcher matcher = URL_DATE_PATTERN.matcher(url);
+    return matcher.find() ? matcher.group(1) : null;
+  }
+
+  @Override
+  protected AbstractRecursiveConverterTask<Film, CrawlerUrlDTO> createNewOwnInstance(
+      final Queue<CrawlerUrlDTO> aElementsToProcess) {
+    return new TagesschauFilmTask(crawler, aElementsToProcess);
+  }
+}
+
+
+
diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTask.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTask.java
new file mode 100644
index 000000000..9a5baf959
--- /dev/null
+++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTask.java
@@ -0,0 +1,80 @@
+package de.mediathekview.mserver.crawler.tagesschau.tasks;
+
+import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
+import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask;
+import de.mediathekview.mserver.crawler.basic.AbstractRecursiveConverterTask;
+import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
+import de.mediathekview.mserver.crawler.tagesschau.TagesschauConstants;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+import java.util.Arrays;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+import java.util.Queue;
+
+/**
+ * Overview task for Tagesschau archive pages.
+ * Reads an overview page and extracts URLs to daily archive pages (as CrawlerUrlDTO).
+ */
+public class TagesschauOverviewTask extends AbstractDocumentTask<CrawlerUrlDTO, CrawlerUrlDTO> {
+
+  private static final Logger LOG = LogManager.getLogger(TagesschauOverviewTask.class);
+
+  private static final String BASE_URL = "https://www.tagesschau.de";
+
+  /**
+   * Matches the two kinds of archive overview URLs, e.g. {@code /multimedia/tsvorzwanzigjahren-472.html}
+   * and {@code /inland/tsvorzwanzigjahren-ts-100.html}; group 1 is the page name without ".html".
+   */
+  private static final Pattern OVERVIEW_URL_PATTERN =
+      Pattern.compile(".*/(tsvorzwanzigjahren(?:-ts)?-?\\d+)\\.html$");
+
+  // URLs that match the pattern but must not be followed (the start page itself)
+  private static final String[] BLACKLIST = new String[] {
+    TagesschauConstants.ARCHIVE_START_URL
+  };
+
+  public TagesschauOverviewTask(final AbstractCrawler aCrawler, final Queue<CrawlerUrlDTO> aUrls) {
+    super(aCrawler, aUrls);
+  }
+
+  @Override
+  protected void processDocument(final CrawlerUrlDTO aUrlDTO, final Document aDocument) {
+    LOG.debug("Processing Tagesschau overview page: {}", aUrlDTO.getUrl());
+
+    final Elements links = aDocument.select("a[href*='tsvorzwanzigjahren']");
+
+    for (final Element link : links) {
+      try {
+        final String href = link.attr("href");
+        if (href.isEmpty()) {
+          continue;
+        }
+        // normalize to absolute
+        final String fullUrl =
+            href.startsWith("http") ? href : BASE_URL + (href.startsWith("/") ? "" : "/") + href;
+
+        final Matcher m = OVERVIEW_URL_PATTERN.matcher(fullUrl);
+        if (!m.find() || Arrays.stream(BLACKLIST).anyMatch(fullUrl::equalsIgnoreCase)) {
+          continue;
+        }
+        // taskResults is a Set, so a page that is linked twice is added and counted only once
+        if (taskResults.add(new CrawlerUrlDTO(fullUrl))) {
+          crawler.incrementAndGetActualCount();
+        }
+      } catch (final Exception e) {
+        LOG.debug("Error while processing overview link", e);
+        crawler.incrementAndGetErrorCount();
+      }
+    }
+  }
+
+  @Override
+  protected AbstractRecursiveConverterTask<CrawlerUrlDTO, CrawlerUrlDTO> createNewOwnInstance(
+      final Queue<CrawlerUrlDTO> aElementsToProcess) {
+    return new TagesschauOverviewTask(crawler, aElementsToProcess);
+  }
+}
+
+
+
+
diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTask.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTask.java
new file mode 100644
index 000000000..78b9478ec
--- /dev/null
+++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTask.java
@@ -0,0 +1,56 @@
+package de.mediathekview.mserver.crawler.tagesschau.tasks;
+
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import com.google.gson.reflect.TypeToken;
+import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
+import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask;
+import de.mediathekview.mserver.crawler.basic.AbstractRecursiveConverterTask;
+import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
+import de.mediathekview.mserver.crawler.tagesschau.json.TagesschauVideoDeserializer;
+import de.mediathekview.mserver.daten.Film;
+import java.lang.reflect.Type;
+import java.util.List;
+import java.util.Queue;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.jsoup.nodes.Document;
+
+public class TagesschauVideoTask extends AbstractDocumentTask<Film, CrawlerUrlDTO> {
+  private static final String DESCRIPTOR_MEDIA_PLAYER = "div[data-v-type=MediaPlayer]";
+  private static final Logger LOG =
LogManager.getLogger(TagesschauVideoTask.class);
+
+  private static final Type FILM_TYPE_TOKEN = new TypeToken<List<Film>>() {}.getType();
+
+  public TagesschauVideoTask(AbstractCrawler crawler, Queue<CrawlerUrlDTO> queue) {
+    super(crawler, queue);
+  }
+
+  /**
+   * Parses the JSON in the {@code data-v} attribute of every media player element of the page
+   * and adds the resulting films to the task results.
+   */
+  @Override
+  protected void processDocument(CrawlerUrlDTO aUrlDTO, Document aDocument) {
+    final Gson gson =
+        new GsonBuilder()
+            .registerTypeAdapter(FILM_TYPE_TOKEN, new TagesschauVideoDeserializer(crawler))
+            .create();
+
+    for (final var player : aDocument.select(DESCRIPTOR_MEDIA_PLAYER)) {
+      try {
+        final List<Film> films = gson.fromJson(player.attr("data-v"), FILM_TYPE_TOKEN);
+        // Gson returns null for an empty JSON string
+        if (films == null || films.isEmpty()) {
+          LOG.debug("No film found in media player of {}", aUrlDTO.getUrl());
+          continue;
+        }
+        taskResults.addAll(films);
+        crawler.incrementAndGetActualCount();
+      } catch (final Exception e) {
+        LOG.error("Error parsing media player data of {}", aUrlDTO.getUrl(), e);
+        crawler.incrementAndGetErrorCount();
+      }
+    }
+  }
+
+  @Override
+  protected AbstractRecursiveConverterTask<Film, CrawlerUrlDTO> createNewOwnInstance(
+      Queue<CrawlerUrlDTO> aElementsToProcess) {
+    return new TagesschauVideoTask(crawler, aElementsToProcess);
+  }
+}
diff --git a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializerTest.java b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializerTest.java
new file mode 100644
index 000000000..29a8ca7e4
--- /dev/null
+++ b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializerTest.java
@@ -0,0 +1,60 @@
+package de.mediathekview.mserver.crawler.tagesschau.json;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+import com.google.gson.JsonElement;
+import de.mediathekview.mserver.base.config.MServerConfigManager;
+import de.mediathekview.mserver.base.messages.listener.MessageListener;
+import de.mediathekview.mserver.crawler.ard.ArdCrawler;
+import de.mediathekview.mserver.crawler.tagesschau.TagesschauCrawler;
+import de.mediathekview.mserver.daten.Film;
+import de.mediathekview.mserver.daten.GeoLocations;
+import de.mediathekview.mserver.daten.Sender;
+import
de.mediathekview.mserver.progress.listeners.SenderProgressListener;
+import de.mediathekview.mserver.testhelper.AssertFilm;
+import de.mediathekview.mserver.testhelper.JsonFileReader;
+import java.time.Duration;
+import java.time.LocalDateTime;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.concurrent.ForkJoinPool;
+
+import org.junit.jupiter.api.Test;
+
+class TagesschauVideoDeserializerTest {
+  protected MServerConfigManager rootConfig = new MServerConfigManager("MServer-JUnit-Config.yaml");
+
+  @Test
+  void deserializeCreatesFilmFromVideoJson() {
+
+    final JsonElement jsonElement =
+        JsonFileReader.readJson("/tagesschau/tagesschau_20jahre_video.json");
+    final TagesschauVideoDeserializer target = new TagesschauVideoDeserializer(createCrawler());
+    final List<Film> actual = target.deserialize(jsonElement, null, null);
+    assertNotNull(actual);
+    assertEquals(1, actual.size());
+    AssertFilm.assertEquals(
+        actual.getFirst(),
+        Sender.TAGESSCHAU24,
+        "tagesschau vor 20 Jahren",
+        "tagesschau vor 20 Jahren, 30. Januar 2006",
+        LocalDateTime.of(2006, 1, 30, 20, 0, 0),
+        Duration.ofSeconds(937),
+        "",
+        "https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1547686.html",
+        new GeoLocations[] {GeoLocations.GEO_NONE},
+        "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webm.h264.mp4",
+        "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webxl.h264.mp4",
+        "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webxxl.h264.mp4",
+        "");
+  }
+
+  protected TagesschauCrawler createCrawler() {
+    final ForkJoinPool forkJoinPool = new ForkJoinPool();
+    final Collection<MessageListener> nachrichten = new ArrayList<>();
+    final Collection<SenderProgressListener> fortschritte = new ArrayList<>();
+
+    return new TagesschauCrawler(forkJoinPool, nachrichten, fortschritte, rootConfig);
+  }
+}
diff --git a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTaskTest.java b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTaskTest.java
new file mode 100644
index 000000000..8fa80a3be
--- /dev/null
+++ b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTaskTest.java
@@ -0,0 +1,59 @@
+package de.mediathekview.mserver.crawler.tagesschau.tasks;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.junit.jupiter.api.Assertions.*;
+
+import de.mediathekview.mserver.base.webaccess.JsoupConnection;
+import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
+import de.mediathekview.mserver.crawler.tagesschau.TagesschauCrawler;
+import de.mediathekview.mserver.testhelper.JsoupMock;
+import java.util.Set;
+import java.util.concurrent.ConcurrentLinkedQueue;
+import org.hamcrest.Matchers;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.ExtendWith;
+import org.mockito.Mock;
+import org.mockito.MockitoAnnotations;
+import
org.mockito.junit.jupiter.MockitoExtension; + +@ExtendWith(MockitoExtension.class) +class TagesschauEntriesTaskTest extends TagesschauTaskTestBase { + + @Mock JsoupConnection jsoupConnection; + + @BeforeEach + void setUp() { + MockitoAnnotations.openMocks(this); + } + + @Test + void testMonth() { + final String requestUrl = "http://tagesschau-month.de"; + final CrawlerUrlDTO[] expectedUrls = + new CrawlerUrlDTO[] { + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1547694.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1547686.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1547682.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1547680.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1547676.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1539874.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1539872.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1539870.html") + }; + + jsoupConnection = + JsoupMock.mock( + requestUrl, "/tagesschau/tagesschau_20jahre_month.html"); + final TagesschauCrawler crawler = createCrawler(); + crawler.setConnection(jsoupConnection); + + final ConcurrentLinkedQueue queue = new ConcurrentLinkedQueue<>(); + queue.add(new CrawlerUrlDTO(requestUrl)); + + final TagesschauEnriesTask target = new TagesschauEnriesTask(crawler, queue); + final Set actual = target.invoke(); + assertEquals(expectedUrls.length, actual.size()); + assertThat(actual, Matchers.containsInAnyOrder(expectedUrls)); + } +} diff --git a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTaskTest.java 
b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTaskTest.java new file mode 100644 index 000000000..1ff8954de --- /dev/null +++ b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTaskTest.java @@ -0,0 +1,96 @@ +package de.mediathekview.mserver.crawler.tagesschau.tasks; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.*; + +import de.mediathekview.mserver.base.webaccess.JsoupConnection; +import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; +import de.mediathekview.mserver.crawler.tagesschau.TagesschauCrawler; +import de.mediathekview.mserver.testhelper.JsoupMock; +import java.util.Set; +import java.util.concurrent.ConcurrentLinkedQueue; +import org.hamcrest.Matchers; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.mockito.junit.jupiter.MockitoExtension; + +@ExtendWith(MockitoExtension.class) +class TagesschauOverviewTaskTest extends TagesschauTaskTestBase { + + @Mock JsoupConnection jsoupConnection; + + @BeforeEach + void setUp() { + MockitoAnnotations.openMocks(this); + } + + @Test + void test() { + final String requestUrl = "http://tagesschau-overview.de"; + final CrawlerUrlDTO[] expectedUrls = + new CrawlerUrlDTO[] { + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-478.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-472.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-442.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-416.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-387.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-359.html"), + 
new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-327.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-301.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-257.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-221.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-183.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-147.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-121.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-136.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-116.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-100.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-106.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-104.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-102.html") + }; + + jsoupConnection = + JsoupMock.mock( + requestUrl, "/tagesschau/tagesschau_20jahre_overview.html"); + final TagesschauCrawler crawler = createCrawler(); + crawler.setConnection(jsoupConnection); + + final ConcurrentLinkedQueue queue = new ConcurrentLinkedQueue<>(); + queue.add(new CrawlerUrlDTO(requestUrl)); + + final TagesschauOverviewTask target = new TagesschauOverviewTask(crawler, queue); + final Set actual = target.invoke(); + assertEquals(expectedUrls.length, actual.size()); + assertThat(actual, Matchers.containsInAnyOrder(expectedUrls)); + } + + @Test + void testYear() { + final String requestUrl = "http://tagesschau-year.de"; + final CrawlerUrlDTO[] expectedUrls = + new CrawlerUrlDTO[] { + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-476.html"), + new 
CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-474.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-468.html") + }; + + jsoupConnection = + JsoupMock.mock( + requestUrl, "/tagesschau/tagesschau_20jahre_year.html"); + final TagesschauCrawler crawler = createCrawler(); + crawler.setConnection(jsoupConnection); + + final ConcurrentLinkedQueue queue = new ConcurrentLinkedQueue<>(); + queue.add(new CrawlerUrlDTO(requestUrl)); + + final TagesschauOverviewTask target = new TagesschauOverviewTask(crawler, queue); + final Set actual = target.invoke(); + assertEquals(expectedUrls.length, actual.size()); + assertThat(actual, Matchers.containsInAnyOrder(expectedUrls)); + } +} diff --git a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauTaskTestBase.java b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauTaskTestBase.java new file mode 100644 index 000000000..06fce0b29 --- /dev/null +++ b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauTaskTestBase.java @@ -0,0 +1,22 @@ +package de.mediathekview.mserver.crawler.tagesschau.tasks; + +import de.mediathekview.mserver.base.config.MServerConfigManager; +import de.mediathekview.mserver.base.messages.listener.MessageListener; +import de.mediathekview.mserver.crawler.tagesschau.TagesschauCrawler; +import de.mediathekview.mserver.progress.listeners.SenderProgressListener; +import de.mediathekview.mserver.testhelper.WireMockTestBase; +import java.util.ArrayList; +import java.util.Collection; +import java.util.concurrent.ForkJoinPool; + +public class TagesschauTaskTestBase extends WireMockTestBase { + + protected MServerConfigManager rootConfig = new MServerConfigManager("MServer-JUnit-Config.yaml"); + + protected TagesschauCrawler createCrawler() { + final ForkJoinPool forkJoinPool = new ForkJoinPool(); + final Collection nachrichten = new ArrayList<>(); + final Collection fortschritte = new 
ArrayList<>(); + return new TagesschauCrawler(forkJoinPool, nachrichten, fortschritte, rootConfig); + } +} diff --git a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTaskTest.java b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTaskTest.java new file mode 100644 index 000000000..dedc33f46 --- /dev/null +++ b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTaskTest.java @@ -0,0 +1,64 @@ +package de.mediathekview.mserver.crawler.tagesschau.tasks; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.junit.jupiter.api.Assertions.*; + +import de.mediathekview.mserver.base.webaccess.JsoupConnection; +import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; +import de.mediathekview.mserver.crawler.tagesschau.TagesschauCrawler; +import de.mediathekview.mserver.daten.Film; +import de.mediathekview.mserver.daten.GeoLocations; +import de.mediathekview.mserver.daten.Sender; +import de.mediathekview.mserver.testhelper.AssertFilm; +import de.mediathekview.mserver.testhelper.JsoupMock; + +import java.time.Duration; +import java.time.LocalDateTime; +import java.util.Set; +import java.util.concurrent.ConcurrentLinkedQueue; +import org.hamcrest.Matchers; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.mockito.junit.jupiter.MockitoExtension; + +@ExtendWith(MockitoExtension.class) +class TagesschauVideoTaskTest extends TagesschauTaskTestBase { + + @Mock JsoupConnection jsoupConnection; + + @BeforeEach + void setUp() { + MockitoAnnotations.openMocks(this); + } + + @Test + void testVideo() { + final String requestUrl = "http://tagesschau-month.de"; + + jsoupConnection = + JsoupMock.mock( + requestUrl, "/tagesschau/tagesschau_20jahre_video.html"); + final TagesschauCrawler crawler = createCrawler(); + 
crawler.setConnection(jsoupConnection); + + final ConcurrentLinkedQueue queue = new ConcurrentLinkedQueue<>(); + queue.add(new CrawlerUrlDTO(requestUrl)); + + final TagesschauVideoTask target = new TagesschauVideoTask(crawler, queue); + final Set actual = target.invoke(); + assertEquals(1, actual.size()); + AssertFilm.assertEquals( + actual.iterator().next(), + Sender.TAGESSCHAU24, + "tagesschau vor 20 Jahren", + "30. Januar 2006", + LocalDateTime.of(2006, 1, 30, 20, 0, 0), + Duration.ofMinutes(15).plusSeconds(37), + "", + "https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1547686.html", + new GeoLocations[] {GeoLocations.GEO_NONE}, "", "", "", ""); + } +} diff --git a/src/test/java/de/mediathekview/mserver/testhelper/JsoupMock.java b/src/test/java/de/mediathekview/mserver/testhelper/JsoupMock.java index 219acb969..57e635634 100644 --- a/src/test/java/de/mediathekview/mserver/testhelper/JsoupMock.java +++ b/src/test/java/de/mediathekview/mserver/testhelper/JsoupMock.java @@ -52,9 +52,9 @@ public static JsoupConnection mockWithTextModifications( final Document document = Jsoup.parse(fileContent); final Document XmlDocument = Jsoup.parse(fileContent, url, Parser.xmlParser()); - Mockito.when(connection.requestBodyAsString(url)).thenReturn(fileContent); + Mockito.lenient().when(connection.requestBodyAsString(url)).thenReturn(fileContent); Mockito.when(connection.requestBodyAsHtmlDocument(url)).thenReturn(document); - Mockito.when(connection.requestBodyAsXmlDocument(org.mockito.Mockito.eq(url))) + Mockito.lenient().when(connection.requestBodyAsXmlDocument(org.mockito.Mockito.eq(url))) .thenReturn(XmlDocument); } catch (final IOException ioException) { LogManager.getLogger(JsoupMock.class) diff --git a/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java b/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java index 4c9b4d7b5..e30d60878 100644 --- 
a/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java +++ b/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java @@ -9,6 +9,12 @@ import org.junit.Before; import jakarta.annotation.Nullable; +import org.junit.BeforeClass; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; + import java.util.Optional; import java.util.Queue; import java.util.concurrent.ConcurrentLinkedQueue; @@ -23,6 +29,7 @@ public abstract class WireMockTestBase { private boolean wireMockStarted = false; @Before + @BeforeEach public void setUpClass() { LOG.info("Setting up WireMock test class"); startWireMock(); @@ -39,6 +46,7 @@ protected synchronized void startWireMock() { } @After + @AfterEach public void tearDownClass() { LOG.info("Tear down WireMock test class"); LOG.info("Stopping WireMock"); diff --git a/src/test/resources/tagesschau/tagesschau_20jahre_month.html b/src/test/resources/tagesschau/tagesschau_20jahre_month.html new file mode 100644 index 000000000..3373acd74 --- /dev/null +++ b/src/test/resources/tagesschau/tagesschau_20jahre_month.html @@ -0,0 +1,6080 @@ + + + + + + + + + + + + + + + + + + + tagesschau vor 20 Jahren - der Januar 2006 | tagesschau.de + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + + + + +
+ + + + + +
+
+ + + + +
+ + + + + + + + + + + + + + +
+ + + + +
+
+ + + +
+ + + + + + +
+ +
+ + + + + + + + + + + + + +
+ + + + +
+ +
+
+
+
+ + + + + + + + + + +
+ + + + + +
+ + + + + + +
+ + + + + + + + + + tagesschau-Logo 1997 - 2005 + +
+ +
+ + + +
+ + + +
+ +

+ Video-Rückblick + Januar 2006 +

+

Stand: 31.01.2026 • 06:03 Uhr

+
+ + + + + + + + + + +

+ Die tagesschau vor 20 Jahren - alle Ausgaben aus dem Januar 2006 +

+ + + + + +
+
+ + + + + + + + + + + + + + + + +

tagesschau.de dokumentiert das Jahr 2006 Tag für Tag mit den 20-Uhr-Ausgaben der Tagesschau.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ +
+
+ + + + + + + + + + + + + + + + + + +
+
+ +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ + +
+ + + + + + + + + + + +
+ +
+ + + + +
+
+
+ + + + + + + + +
+ +
+ + + + + + + + + + + + + +
+ + + + + + + + + + +
+ + diff --git a/src/test/resources/tagesschau/tagesschau_20jahre_overview.html b/src/test/resources/tagesschau/tagesschau_20jahre_overview.html new file mode 100644 index 000000000..9c1b94334 --- /dev/null +++ b/src/test/resources/tagesschau/tagesschau_20jahre_overview.html @@ -0,0 +1,7356 @@ + + + + + + + + + + + + + + + + + + + Zeitgeschichte: Die tagesschau vor 20 Jahren | tagesschau.de + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + + + + +
+ + + + + +
+
+ + + + +
+ + + + + + + + + + + + + + +
+ + + + +
+
+ + + +
+ + + + + + +
+ +
+ + + + + + + + + + + + + +
+ + + + +
+ +
+
+
+
+ + + + + + + + + + +
+ + + + + +
+ + + + + + +
+ + + + + + + + + + tagesschau-Logo 1997 - 2005 + +
+ +
+ + + +
+ + + +
+ +

+ Zeitgeschichte + tagesschau vor 20 Jahren +

+

Stand: 24.04.2026 • 06:52 Uhr

+
+ + + + + + + + + + +

+ tagesschau vor 20 Jahren: Wie sah die Welt vor 20 Jahren aus? Alle 20-Uhr-Sendungen der tagesschau seit 1989 im Nachrichtenrückblick. +

+ + + + + +
+
+ + + + + + + + + + + + + + + + +

Wie sah die Welt vor 20 Jahren aus? Welche Themen bestimmten die politische Debatte? Wie wurden die Probleme bewertet? Wer war damals wichtig? Die tagesschau bietet einen wertvollen Einblick in die jüngere Zeitgeschichte.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Die tagesschau vor 20 Jahren dokumentiert alle 20-Uhr-Ausgaben Tag für Tag im Nachrichtenrückblick.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Tagesschau-Ausgaben im April 2006

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

Die Jahre 1989-2006

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ + +
+ + + + + + + + + + + +
+ +
+ + + + +
+
+
+ + + + + + + + +
+ +
s + + + + + + + + + + + + + +
+ + + + + + + + + + +
+ + diff --git a/src/test/resources/tagesschau/tagesschau_20jahre_video.html b/src/test/resources/tagesschau/tagesschau_20jahre_video.html new file mode 100644 index 000000000..7db1ec2dc --- /dev/null +++ b/src/test/resources/tagesschau/tagesschau_20jahre_video.html @@ -0,0 +1,5288 @@ + + + + + + + + + + + + + + + + + + + tagesschau vor 20 Jahren, 30. Januar 2006 | tagesschau.de + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + + + + +
+ + + + + +
+
+ + + + +
+ + + + + + + + + + + + + + +
+ + + + +
+
+ + + +
+ + + + + + +
+ +
+ + + + + + + + + + + + + +
+ + + + +
+ +
+
+
+
+
+ +
+ +
+ + + +
+
+ + Sendung + +
+

+ + tagesschau vor 20 Jahren, 30. Januar 2006 +

+
Stand: 22.01.2026 • 13:38 Uhr
+
+

tagesschau vor 20 Jahren, 30. Januar 2006

+
+
+ +
+ + + + + +
+ + + + + + + + + + Sendungsbild + +
+ + + + + + + + + +
+
+ + + + +
+
+ tagesschau vor 20 Jahren, Das Erste, 30.01.2006 • 20:00 Uhr +
+
+
+ +
+ + + + + + +
+ + + + + + + + + + + +
+ + +
+ + + + +
+
+
+ + + + + + + +
+ +
+ + + + + + + + + + + + + +
+ + + + + + + + + + +
+ + diff --git a/src/test/resources/tagesschau/tagesschau_20jahre_video.json b/src/test/resources/tagesschau/tagesschau_20jahre_video.json new file mode 100644 index 000000000..554520562 --- /dev/null +++ b/src/test/resources/tagesschau/tagesschau_20jahre_video.json @@ -0,0 +1,248 @@ +{ + "pc" : { + "web" : { + "baseUrl" : "/resources/assets/js/vendor/ardplayer/", + "isForcedAutoPlay" : false, + "isEnablePostMessage" : false, + "isForcedVideoView" : true, + "disableBackButtonTitle" : true, + "disablePosterTitle" : true, + "disablePosterImage" : true + }, + "generic" : { + "imageTemplateConfig" : { + "size" : [ { + "minWidth" : 0, + "value" : "AAABnSSvsdE/16x9-small" + }, { + "minWidth" : 568, + "value" : "AAABnSSvrFg/16x9-big" + } ], + "width" : { + "min" : 320, + "max" : 1920, + "stepSize" : 320 + } + }, + "isAutoplay" : false + }, + "pluginData" : { } + }, + "mc" : { + "streams" : [ { + "media" : [ { + "url" : "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webs.h264.mp4", + "mimeType" : "video/mp4", + "fallbackGroup" : 9, + "maxHResolutionPx" : 480, + "audios" : [ { + "kind" : "standard", + "languageCode" : "de" + } ] + }, { + "url" : "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webm.h264.mp4", + "mimeType" : "video/mp4", + "fallbackGroup" : 6, + "maxHResolutionPx" : 640, + "audios" : [ { + "kind" : "standard", + "languageCode" : "de" + } ] + }, { + "url" : "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webl.h264.mp4", + "mimeType" : "video/mp4", + "fallbackGroup" : 4, + "maxHResolutionPx" : 960, + "audios" : [ { + "kind" : "standard", + "languageCode" : "de" + } ] + }, { + "url" : "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webxl.h264.mp4", + "mimeType" : "video/mp4", + "fallbackGroup" : 2, + "maxHResolutionPx" : 1280, + "audios" : [ { + "kind" : "standard", + "languageCode" : "de" + } ] + }, { + "url" : 
"https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webxxl.h264.mp4", + "mimeType" : "video/mp4", + "fallbackGroup" : 1, + "maxHResolutionPx" : 1920, + "audios" : [ { + "kind" : "standard", + "languageCode" : "de" + } ] + }, { + "url" : "https://adaptive.tagesschau.de/i/video/2026/0122/TV-20260122-1304-0500,.webs.h264.mp4,.webl.h264.mp4,.webxl.h264.mp4,.webxxl.h264.mp4,.webm.h264.mp4,.csmil/master.m3u8", + "mimeType" : "application/vnd.apple.mpegurl", + "audios" : [ { + "kind" : "standard", + "languageCode" : "de" + } ], + "isAdaptiveQualitySelectable" : true + } ], + "kind" : "main", + "isAudioOnly" : false + } ], + "meta" : { + "images" : [ { + "url" : "https://images.tagesschau.de/image/e4798de6-ff1d-49ed-827e-334b743c6961/AAABm-W2WN8/{size}/sendungsbild-1674792.webp?width={width}", + "kind" : "preview", + "title" : "Sendungsbild | ARD-aktuell", + "alt" : "Sendungsbild" + }, { + "url" : "https://images.tagesschau.de/image/e4798de6-ff1d-49ed-827e-334b743c6961/AAABm-W2WN8/{size}/sendungsbild-1674792.jpg?width={width}", + "kind" : "preview", + "title" : "Sendungsbild | ARD-aktuell", + "alt" : "Sendungsbild" + } ], + "title" : "tagesschau vor 20 Jahren, 30. Januar 2006", + "seriesTitle" : "tagesschau vor 20 Jahren", + "durationSeconds" : 937, + "broadcastedOnDateTime" : "2006-01-30T19:00:00+0000", + "showBroadcastedOnWithTime" : true, + "synopsis" : "tagesschau vor 20 Jahren, 30. Januar 2006" + }, + "pluginData" : { + "sharing@web" : { + "link" : "https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1547686.html", + "body" : "Meine Empfehlung: „tagesschau vor 20 Jahren, 30. Januar 2006” \nhttps://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1547686.html", + "subject" : "Entdeckt auf tagesschau.de: tagesschau vor 20 Jahren, 30. 
Januar 2006", + "disableSubclipping" : true, + "services" : [ "url", "email", "whatsapp", "facebook", "fb-messenger", "twitter", "telegram", "threema", "embed" ], + "embedCode" : "", + "params" : "?startTime=$start$&endTime=$ende$", + "legal" : "Durch die Einbettung von ARD-Videos auf Ihrer Webseite stimmen Sie den ARD Nutzungsbedingungen zu.", + "embedDialogTitle" : "Inhalt einbetten" + }, + "download@web" : { + "isEnabled" : true, + "sources" : [ { + "title" : "SD 270p", + "url" : "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webs.h264.mp4?download=true" + }, { + "title" : "SD 360p", + "url" : "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webm.h264.mp4?download=true" + }, { + "title" : "SD 540p", + "url" : "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webl.h264.mp4?download=true" + }, { + "title" : "HD 720p", + "url" : "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webxl.h264.mp4?download=true" + }, { + "title" : "HD 1080p", + "url" : "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webxxl.h264.mp4?download=true" + }, { + "title" : "SD 540p", + "url" : "https://tagesschau-podcast.ard-mcdn.de/audio/2026/0122/TV-20260122-1304-0500.mp3?download=true" + } ] + }, + "trackingPiano@all" : { + "config" : { + "dimensionTransform" : { + "av_autoplay" : "av_auto_mode" + }, + "events" : [ "av.speed", "av.quality", "av.error", "av.volume", "av.volume.mute", "av.share", "av.dialog.open", "av.dialog.close", "av.playermode", "av.jumpmark", "av.subtitle", "av.language", "av.audiodescription", "av.signlanguage", "av.set.sleeptimer", "av.playlist", "av.embed.click", "av.embed.title", "av.recommendation" ] + }, + "avContent" : { + "atiTagConfig" : { + "site" : 595936, + "collectDomain" : "vqggwrz.pa-cd.com", + "addEventURL" : "true" + }, + "s:tgp_page_chapter1" : "multimedia", + "s:tgp_page_chapter2" : 
"sendung", + "s:tgp_page_chapter3" : "tagesschau_vor_20_jahren", + "s:technical_tagging_guide" : "Unified", + "s:site_level2" : "www.tagesschau.de", + "s:tgp_product_platform" : "Web", + "b:product_login" : true, + "content_authors" : [ "tagesschau" ], + "s:page" : "tagesschau vor 20 Jahren, 30. Januar 2006", + "b:tgp_page_contains_video" : true, + "b:tgp_page_contains_audio" : false, + "s:tgp_content_object_type" : "Video", + "d:tgp_content_publication_time" : "2026-01-22T13:38:44Z", + "s:tgp_content_external_id" : "7539dc1b-9778-44f8-b93b-44f16ee02bb7", + "s:tgp_content_id" : "video-1547686", + "s:tgp_content_content_type" : "Sendung", + "d:tgp_content_last_editorial_update" : "2026-01-22T13:38:44Z", + "a:s:tgp_content_keywords" : [ "Tagesschau vor 20 Jahren" ], + "s:tgp_content_creator" : "tagesschau", + "s:tgp_content_creator_institution" : "tagesschau", + "s:tgp_content_broadcast_station" : "Das Erste", + "d:tgp_content_online_since" : "2026-01-22T13:38:44Z", + "s:tgp_content_show" : "tagesschau vor 20 Jahren", + "s:tgp_content_show_id" : "tsv20", + "b:tgp_content_broadcast_reference" : true, + "n:tgp_content_days_since_publication" : 94, + "n:tgp_content_days_since_online" : 94, + "a:s:tgp_content_external_tags" : [ "Tagesschau vor 20 Jahren" ], + "s:tgp_content_teaser_image_url" : "https://images.tagesschau.de/image/e4798de6-ff1d-49ed-827e-334b743c6961/AAABm-W2WN8/AAABnR8VW9w/original/sendungsbild-1674792.jpg", + "s:tgp_content_url_path" : "/multimedia/sendung/tagesschau_vor_20_jahren/video-1547686.html", + "b:tgp_content_author_visible" : false, + "n:av_content_duration" : 937000, + "s:av_content" : "tagesschau vor 20 Jahren, 30. 
Januar 2006", + "av_content_id" : "video-1547686", + "s:av_show" : "tagesschau vor 20 Jahren", + "s:av_content_type" : "Video", + "s:av_broadcasting_type" : "OnDemand", + "d:av_original_air_time" : "2006-01-30T20:00:00Z", + "b:av_web_only" : false, + "b:av_full_show" : true, + "s:av_player" : "ARD Player", + "s:av_content_creator" : "tagesschau", + "s:av_content_external_id" : "7539dc1b-9778-44f8-b93b-44f16ee02bb7", + "s:av_institution" : "tagesschau", + "s:av_publisher" : "Das Erste", + "s:av_show_id" : "tsv20", + "s:av_content_url" : "https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1547686.html", + "d:av_online_since" : "2026-01-22T13:38:44Z", + "d:av_publication_time" : "2026-01-22T13:38:44Z" + }, + "isEnabled" : true + }, + "trackingAgf@all" : { + "appId" : "PE6FF1BB7-FE88-4674-B083-2772ADAD55E9", + "playerID" : "video-1547686", + "clipData" : { + "type" : "content", + "assetid" : "video-1547686_0", + "program" : "tagesschau vor 20 Jahren", + "length" : "937", + "title" : "Das Erste_tagesschau vor 20 Jahren_tagesschau vor 20 Jahren, 30. Januar 2006_2026.01.22 13:38:44", + "nol_c0" : "p0,0", + "nol_c2" : "p2,N", + "nol_c5" : "p5,https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1547686.html", + "nol_c7" : "p7,video-1547686", + "nol_c9" : "p9,tagesschau vor 20 Jahren_tagesschau vor 20 Jahren, 30. Januar 2006_2026.01.22 13:38:44", + "nol_c10" : "p10,Das Erste", + "nol_c12" : "p12,Content", + "nol_c16" : "p16,ARD_Information", + "nol_c18" : "p18,N" + }, + "sfcode" : "eu", + "prod" : "vc", + "apn" : "ardplayer", + "agfMetaDataSDK" : { + "censuscategory" : "Das Erste_tagesschau vor 20 Jahren_tagesschau vor 20 Jahren, 30. Januar 2006_2026.01.22 13:38:44", + "livestream" : "no" + } + } + } + }, + "mediadescription" : "tagesschau vor 20 Jahren, 30. 
Januar 2006", + "playerType" : "video", + "ratio" : "16x9", + "context" : "avdetail", + "posterImage" : { + "altText" : "Sendungsbild", + "title" : "Sendungsbild | ARD-aktuell", + "urlS" : "https://images.tagesschau.de/image/e4798de6-ff1d-49ed-827e-334b743c6961/AAABm-W2WN8/AAABnSSvrFg/16x9-big/sendungsbild-1674792.webp?width=640", + "urlM" : "https://images.tagesschau.de/image/e4798de6-ff1d-49ed-827e-334b743c6961/AAABm-W2WN8/AAABnSSvrFg/16x9-big/sendungsbild-1674792.webp?width=768", + "urlL" : "https://images.tagesschau.de/image/e4798de6-ff1d-49ed-827e-334b743c6961/AAABm-W2WN8/AAABnSSvrFg/16x9-big/sendungsbild-1674792.webp?width=1280" + } +} \ No newline at end of file diff --git a/src/test/resources/tagesschau/tagesschau_20jahre_year.html b/src/test/resources/tagesschau/tagesschau_20jahre_year.html new file mode 100644 index 000000000..0e51be0a7 --- /dev/null +++ b/src/test/resources/tagesschau/tagesschau_20jahre_year.html @@ -0,0 +1,6303 @@ + + + + + + + + + + + + + + + + + + + tagesschau vor 20 Jahren: 2006 | tagesschau.de + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + + + + +
+ + + + + +
+
+ + + + +
+ + + + + + + + + + + + + + +
+ + + + +
+
+ + + +
+ + + + + + +
+ +
+ + + + + + + + + + + + + +
+ + + + +
+ +
+
+
+
+ + + + + + + + + + +
+ + + + + +
+ + + + + + +
+ + + + + + + + + + tagesschau-Logo 1997 - 2005 + +
+ +
+ + + +
+ + + +
+ +

+ Video-Rückblick + Nachrichten aus dem Jahr 2006 +

+

Stand: 01.04.2026 • 11:47 Uhr

+
+ + + + + + + + + + +

+ Nachrichten-Rückblick auf das Jahr 2006: Zeitgeschichte und historische Ereignisse - Hier können Sie sie in der tagesschau vor 20 Jahren noch einmal sehen. +

+ + + + + +
+
+ + + + + + + + + + + + + + + + +

tagesschau.de dokumentiert das Jahr 2006 Tag für Tag mit den 20-Uhr-Ausgaben der Tagesschau.

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + +
+ + +
+ + + + + + + + + + + +
+ +
+ + + + +
+
+
+ + + + + + + + +
+ +
+ + + + + + + + + + + + + +
+ + + + + + + + + + +
+ + From c13ab680fb027dcc909c32c6864bdc1b3fb7c729 Mon Sep 17 00:00:00 2001 From: pidoubleyou <22942659+pidoubleyou@users.noreply.github.com> Date: Tue, 12 May 2026 22:23:21 +0200 Subject: [PATCH 02/10] zweiter Zwischenstand --- .../crawler/tagesschau/TagesschauCrawler.java | 52 ++-- .../json/TagesschauVideoDeserializer.java | 19 +- .../tagesschau/tasks/TagesschauFilmTask.java | 238 ------------------ .../json/TagesschauVideoDeserializerTest.java | 3 +- .../tasks/TagesschauVideoTaskTest.java | 15 +- 5 files changed, 46 insertions(+), 281 deletions(-) delete mode 100644 src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauFilmTask.java diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java index d58b44b17..58ce95065 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java @@ -1,5 +1,8 @@ package de.mediathekview.mserver.crawler.tagesschau; +import de.mediathekview.mserver.crawler.tagesschau.tasks.TagesschauEnriesTask; +import de.mediathekview.mserver.crawler.tagesschau.tasks.TagesschauOverviewTask; +import de.mediathekview.mserver.crawler.tagesschau.tasks.TagesschauVideoTask; import de.mediathekview.mserver.daten.Film; import de.mediathekview.mserver.daten.Sender; import de.mediathekview.mserver.base.messages.listener.MessageListener; @@ -7,7 +10,6 @@ import de.mediathekview.mserver.base.messages.ServerMessages; import de.mediathekview.mserver.crawler.basic.AbstractCrawler; import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; -import de.mediathekview.mserver.crawler.tagesschau.tasks.TagesschauFilmTask; import de.mediathekview.mserver.progress.listeners.SenderProgressListener; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -19,10 +21,6 @@ import 
java.util.concurrent.ForkJoinPool; import java.util.concurrent.RecursiveTask; -/** - * Crawler for the Tagesschau "vor 20 Jahren" (20 years ago) archive. - * Extracts daily news broadcasts from the archive. - */ public class TagesschauCrawler extends AbstractCrawler { private static final Logger LOG = LogManager.getLogger(TagesschauCrawler.class); @@ -43,24 +41,29 @@ public Sender getSender() { @Override protected RecursiveTask> createCrawlerTask() { try { - // Create URLs for the last YEARS_BACK years - Queue filmUrls = createFilmUrls(); + Queue archiveUrl = createArchiveUrl(); - if (filmUrls.isEmpty()) { - LOG.warn("No URLs created for Tagesschau crawler"); - return null; - } + TagesschauOverviewTask overviewTask = new TagesschauOverviewTask(this, archiveUrl); + final Set overviewResults = this.forkJoinPool.submit(overviewTask).get(); - // Set max count for progress tracking - getAndSetMaxCount(filmUrls.size()); + // TODO nur für den aktuellen Monat passt die Logik + // für alle anderen Einträge muss rekursive OverviewTask genutzt werden, bis die Monatsseite erreicht ist + + LOG.debug("Overview task completed. Found {} overview URLs.", overviewResults.size()); + + TagesschauEnriesTask entriesTask = new TagesschauEnriesTask(this, new ConcurrentLinkedQueue<>(overviewResults)); + final Set entriesResults = this.forkJoinPool.submit(entriesTask).get(); + + LOG.debug("Entries task completed. 
Found {} entry URLs.", entriesResults.size()); + + getAndSetMaxCount(entriesResults.size()); printMessage( ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), - filmUrls.size()); + entriesResults.size()); - // Return the task that will process the URLs - return new TagesschauFilmTask(this, filmUrls); + return new TagesschauVideoTask(this, new ConcurrentLinkedQueue<>(entriesResults)); } catch (final Exception ex) { LOG.fatal("Exception in Tagesschau crawler.", ex); @@ -69,22 +72,9 @@ protected RecursiveTask> createCrawlerTask() { return null; } - /** - * Creates URLs for the daily broadcast pages. - * We need to crawl through the years and generate URLs for each day. - */ - private Queue createFilmUrls() { + private Queue createArchiveUrl() { Queue urls = new ConcurrentLinkedQueue<>(); - - try { - // For now, we start by fetching the main archive page - // This page contains links to the individual days - urls.add(new CrawlerUrlDTO(TagesschauConstants.ARCHIVE_START_URL)); - - } catch (final Exception e) { - LOG.error("Error creating film URLs", e); - } - + urls.add(new CrawlerUrlDTO(TagesschauConstants.ARCHIVE_START_URL)); return urls; } } diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java index c473eb59f..c50c592c1 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java @@ -9,10 +9,10 @@ import de.mediathekview.mserver.daten.*; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.glassfish.jersey.message.internal.Quality; import java.lang.reflect.Type; import java.net.MalformedURLException; +import java.net.URI; import java.time.Duration; import java.time.LocalDateTime; import 
java.time.OffsetDateTime; @@ -25,6 +25,8 @@ public class TagesschauVideoDeserializer implements JsonDeserializer> private static final String ELEMENT_MC = "mc"; private static final String ELEMENT_MEDIA = "media"; private static final String ELEMENT_META = "meta"; + private static final String ELEMENT_PLUG_IN_DATA = "pluginData"; + private static final String ELEMENT_SHARING_WEB = "sharing@web"; private static final String ELEMENT_STREAMS = "streams"; private static final String ATTRIBUTE_DATE = "broadcastedOnDateTime"; @@ -35,6 +37,7 @@ public class TagesschauVideoDeserializer implements JsonDeserializer> private static final String ATTRIBUTE_WIDTH = "maxHResolutionPx"; private static final String ATTRIBUTE_MIMETYPE = "mimeType"; private static final String ATTRIBUTE_URL = "url"; + private static final String ATTRIBUTE_LINK = "link"; private static final String[] SUPPORTED_MIME_TYPES = new String[] { "video/mp4" }; private static final DateTimeFormatter DATE_TIME_FORMATTER = @@ -63,6 +66,7 @@ public List deserialize( final Optional duration = JsonUtils.getAttributeAsInt(metaObject, ATTRIBUTE_DURATION); final Optional date = parseDate(metaObject); final Map urls = parseUrls(mcElement.get().getAsJsonObject()); + final String website = parseWebsite(mcElement.get().getAsJsonObject()); // TODO Prüfungen auf Topic+Titel // TODO Zeitzone passt nicht @@ -76,6 +80,13 @@ public List deserialize( date.get(), duration.isEmpty() ? 
Duration.ofSeconds(0) : Duration.ofSeconds(duration.get())); film.addGeolocation(GeoLocations.GEO_NONE); + if (!website.isEmpty()) { + try { + film.setWebsite(URI.create(website).toURL()); + } catch (MalformedURLException e) { + LOG.error("Invalid website URL: {}", website, e); + } + } urls.forEach((resolution, url) -> { try { @@ -92,9 +103,13 @@ public List deserialize( return results; } + private String parseWebsite(JsonObject mcObject) { + return JsonUtils.getElementValueAsString(mcObject, ELEMENT_PLUG_IN_DATA, ELEMENT_SHARING_WEB, ATTRIBUTE_LINK).orElse(""); + } + private Map parseUrls(final JsonObject mcObject) { // TODO robust machen gegen fehlende Elemente - final Map urls = new EnumMap(Resolution.class); + final Map urls = new EnumMap<>(Resolution.class); mcObject.get(ELEMENT_STREAMS).getAsJsonArray().forEach(stream -> { stream.getAsJsonObject().get(ELEMENT_MEDIA).getAsJsonArray().forEach(media -> { diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauFilmTask.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauFilmTask.java deleted file mode 100644 index 3fcf21a3e..000000000 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauFilmTask.java +++ /dev/null @@ -1,238 +0,0 @@ -package de.mediathekview.mserver.crawler.tagesschau.tasks; - -import de.mediathekview.mserver.daten.Film; -import de.mediathekview.mserver.daten.FilmUrl; -import de.mediathekview.mserver.daten.Resolution; -import de.mediathekview.mserver.crawler.basic.AbstractCrawler; -import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask; -import de.mediathekview.mserver.crawler.basic.AbstractRecursiveConverterTask; -import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -import java.net.MalformedURLException; 
-import java.time.LocalDate; -import java.time.format.DateTimeFormatter; -import java.util.Queue; -import java.util.UUID; - -/** - * Task for processing Tagesschau archive pages. - * Extracts links to daily broadcasts and creates Film objects. - */ -public class TagesschauFilmTask extends AbstractDocumentTask { - - private static final Logger LOG = LogManager.getLogger(TagesschauFilmTask.class); - private static final DateTimeFormatter DATE_FORMAT = DateTimeFormatter.ofPattern("yyyyMMdd"); - - public TagesschauFilmTask( - final AbstractCrawler aCrawler, - final Queue aUrls) { - super(aCrawler, aUrls); - } - - @Override - protected void processDocument( - final CrawlerUrlDTO aUrlDTO, - final Document aDocument) { - try { - LOG.debug("Processing Tagesschau archive page: {}", aUrlDTO.getUrl()); - - // Look for video/broadcast links on the page - Elements teaserLinks = aDocument.select("a[href*='/multimedia/sendung/ts/vor20jahren']"); - - for (Element link : teaserLinks) { - try { - String href = link.attr("href"); - String title = link.select(".teaser-absatz__headline").text(); - String description = link.select(".teaser-absatz__shorttext").text(); - - if (!href.startsWith("/")) { - continue; - } - - // Make absolute URL - String fullUrl = "https://www.tagesschau.de" + href; - - // Try to extract date from URL - String dateStr = extractDateFromUrl(href); - - if (!title.isEmpty()) { - Film film = createFilmFromTeaser(title, description, dateStr, fullUrl); - if (film != null) { - taskResults.add(film); - crawler.incrementAndGetActualCount(); - } - } - - } catch (final Exception e) { - LOG.debug("Error parsing teaser link", e); - crawler.incrementAndGetErrorCount(); - } - } - - // If we found links, we're done - if (!taskResults.isEmpty()) { - LOG.debug("Found {} films on archive page", taskResults.size()); - return; - } - - // Otherwise, try to extract from video elements - Elements videoElements = aDocument.select("[data-js_component='video'], video, .video"); - - for 
(Element videoElem : videoElements) { - try { - Film film = parseVideoElement(videoElem); - if (film != null && !film.getUrls().isEmpty()) { - taskResults.add(film); - crawler.incrementAndGetActualCount(); - } - } catch (final Exception e) { - LOG.debug("Error parsing video element", e); - crawler.incrementAndGetErrorCount(); - } - } - - } catch (final Exception e) { - crawler.incrementAndGetErrorCount(); - LOG.error("Error processing document: {}", aUrlDTO.getUrl(), e); - } - } - - /** - * Creates a Film object from teaser information. - */ - private Film createFilmFromTeaser(String title, String description, String dateStr, String url) { - try { - Film film = new Film( - UUID.randomUUID(), - crawler.getSender(), - title.trim(), - "Tagesschau vor 20 Jahren", - dateStr != null ? LocalDate.parse(dateStr, DATE_FORMAT).atStartOfDay() : null, - null); - - if (!description.isEmpty()) { - film.setBeschreibung(description.trim()); - } - - // Add URL - try { - FilmUrl filmUrl = new FilmUrl(url, 0L); - film.addUrl(Resolution.HD, filmUrl); - } catch (final MalformedURLException e) { - LOG.warn("Invalid URL: {}", url, e); - return null; - } - - return film; - - } catch (final Exception e) { - LOG.debug("Error creating film from teaser", e); - return null; - } - } - - /** - * Parses a video element to extract Film information. 
- */ - private Film parseVideoElement(Element videoElem) { - try { - String title = ""; - Element titleElem = videoElem.selectFirst(".video-title, .headline, h3, h2"); - if (titleElem != null) { - title = titleElem.text(); - } - - if (title.isEmpty()) { - title = "Tagesschau Archiv"; - } - - Film film = new Film( - UUID.randomUUID(), - crawler.getSender(), - title, - "Tagesschau vor 20 Jahren", - null, - null); - - // Try to get description - Element descElem = videoElem.selectFirst(".description, .shorttext, p"); - if (descElem != null) { - String desc = descElem.text(); - if (!desc.isEmpty()) { - film.setBeschreibung(desc); - } - } - - // Try to get URL - String url = videoElem.attr("data-href"); - if (url.isEmpty()) { - Element linkElem = videoElem.selectFirst("a[href]"); - if (linkElem != null) { - url = linkElem.attr("href"); - } - } - if (url.isEmpty()) { - Element sourceElem = videoElem.selectFirst("source"); - if (sourceElem != null) { - url = sourceElem.attr("src"); - } - } - - if (!url.isEmpty()) { - if (!url.startsWith("http")) { - url = "https://www.tagesschau.de" + (url.startsWith("/") ? "" : "/") + url; - } - try { - FilmUrl filmUrl = new FilmUrl(url, 0L); - film.addUrl(Resolution.HD, filmUrl); - } catch (final MalformedURLException e) { - LOG.warn("Invalid URL: {}", url, e); - return null; - } - } - - return film; - - } catch (final Exception e) { - LOG.debug("Error parsing video element", e); - return null; - } - } - - /** - * Extracts date from URL in format yyyyMMdd. 
- * Example: ts-vor20jahren-20060401 -> 20060401 - */ - private String extractDateFromUrl(String url) { - try { - // Look for pattern like vor20jahren-20060401 - if (url.contains("vor20jahren-")) { - String[] parts = url.split("vor20jahren-"); - if (parts.length > 1) { - // Extract date part (should be 8 digits) - String datepart = parts[1].replaceAll("\\D", ""); - if (datepart.length() >= 8) { - return datepart.substring(0, 8); - } - } - } - } catch (final Exception e) { - LOG.debug("Could not extract date from URL: {}", url); - } - return null; - } - - @Override - protected AbstractRecursiveConverterTask createNewOwnInstance( - final Queue aElementsToProcess) { - return new TagesschauFilmTask(crawler, aElementsToProcess); - } -} - - - diff --git a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializerTest.java b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializerTest.java index 29a8ca7e4..b496466b2 100644 --- a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializerTest.java +++ b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializerTest.java @@ -5,7 +5,6 @@ import com.google.gson.JsonElement; import de.mediathekview.mserver.base.config.MServerConfigManager; import de.mediathekview.mserver.base.messages.listener.MessageListener; -import de.mediathekview.mserver.crawler.ard.ArdCrawler; import de.mediathekview.mserver.crawler.tagesschau.TagesschauCrawler; import de.mediathekview.mserver.daten.Film; import de.mediathekview.mserver.daten.GeoLocations; @@ -45,8 +44,8 @@ void test() { "https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1547686.html", new GeoLocations[] {GeoLocations.GEO_NONE}, "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webm.h264.mp4", + "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webl.h264.mp4", 
"https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webxl.h264.mp4", - "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webxxl.h264.mp4", ""); } diff --git a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTaskTest.java b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTaskTest.java index dedc33f46..4991b784e 100644 --- a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTaskTest.java +++ b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTaskTest.java @@ -1,6 +1,5 @@ package de.mediathekview.mserver.crawler.tagesschau.tasks; -import static org.hamcrest.MatcherAssert.assertThat; import static org.junit.jupiter.api.Assertions.*; import de.mediathekview.mserver.base.webaccess.JsoupConnection; @@ -11,12 +10,10 @@ import de.mediathekview.mserver.daten.Sender; import de.mediathekview.mserver.testhelper.AssertFilm; import de.mediathekview.mserver.testhelper.JsoupMock; - import java.time.Duration; import java.time.LocalDateTime; import java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; -import org.hamcrest.Matchers; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; @@ -38,9 +35,7 @@ void setUp() { void testVideo() { final String requestUrl = "http://tagesschau-month.de"; - jsoupConnection = - JsoupMock.mock( - requestUrl, "/tagesschau/tagesschau_20jahre_video.html"); + jsoupConnection = JsoupMock.mock(requestUrl, "/tagesschau/tagesschau_20jahre_video.html"); final TagesschauCrawler crawler = createCrawler(); crawler.setConnection(jsoupConnection); @@ -54,11 +49,15 @@ void testVideo() { actual.iterator().next(), Sender.TAGESSCHAU24, "tagesschau vor 20 Jahren", - "30. Januar 2006", + "tagesschau vor 20 Jahren, 30. 
Januar 2006", LocalDateTime.of(2006, 1, 30, 20, 0, 0), Duration.ofMinutes(15).plusSeconds(37), "", "https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-1547686.html", - new GeoLocations[] {GeoLocations.GEO_NONE}, "", "", "", ""); + new GeoLocations[] {GeoLocations.GEO_NONE}, + "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webm.h264.mp4", + "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webl.h264.mp4", + "https://tagesschau-progressive.ard-mcdn.de/video/2026/0122/TV-20260122-1304-0500.webxl.h264.mp4", + ""); } } From d694e5f0f6222e82a66414dfc746449ca7a4d6f5 Mon Sep 17 00:00:00 2001 From: pidoubleyou <22942659+pidoubleyou@users.noreply.github.com> Date: Wed, 20 May 2026 21:10:29 +0200 Subject: [PATCH 03/10] recursive search --- .../crawler/tagesschau/EntryUrlDto.java | 32 ++ .../crawler/tagesschau/TagesschauCrawler.java | 54 +-- .../json/TagesschauVideoDeserializer.java | 145 ++++--- ...esTask.java => TagesschauEntriesTask.java} | 40 +- .../tasks/TagesschauOverviewTask.java | 80 ---- .../tagesschau/tasks/TagesschauVideoTask.java | 1 + .../json/TagesschauVideoDeserializerTest.java | 13 +- .../tasks/TagesschauEntriesTaskTest.java | 80 +++- .../tasks/TagesschauOverviewTaskTest.java | 96 ----- .../tagesschau_20jahre_video_date.json | 371 ++++++++++++++++++ 10 files changed, 642 insertions(+), 270 deletions(-) create mode 100644 src/main/java/de/mediathekview/mserver/crawler/tagesschau/EntryUrlDto.java rename src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/{TagesschauEnriesTask.java => TagesschauEntriesTask.java} (55%) delete mode 100644 src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTask.java delete mode 100644 src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTaskTest.java create mode 100644 src/test/resources/tagesschau/tagesschau_20jahre_video_date.json diff --git 
a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/EntryUrlDto.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/EntryUrlDto.java new file mode 100644 index 000000000..6710533e7 --- /dev/null +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/EntryUrlDto.java @@ -0,0 +1,32 @@ +package de.mediathekview.mserver.crawler.tagesschau; + +import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; + +import java.util.HashSet; +import java.util.Set; + +public class EntryUrlDto { + private final Set videos; + private final Set subPages; + + public EntryUrlDto() { + this.videos = new HashSet<>(); + this.subPages = new HashSet<>(); + } + + public Set getVideos() { + return videos; + } + + public Set getSubPages() { + return subPages; + } + + public void addVideo(CrawlerUrlDTO videoUrl) { + this.videos.add(videoUrl); + } + + public void addSubPage(CrawlerUrlDTO subPageUrl) { + this.subPages.add(subPageUrl); + } +} \ No newline at end of file diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java index 58ce95065..4aac38fa4 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java @@ -1,7 +1,6 @@ package de.mediathekview.mserver.crawler.tagesschau; -import de.mediathekview.mserver.crawler.tagesschau.tasks.TagesschauEnriesTask; -import de.mediathekview.mserver.crawler.tagesschau.tasks.TagesschauOverviewTask; +import de.mediathekview.mserver.crawler.tagesschau.tasks.TagesschauEntriesTask; import de.mediathekview.mserver.crawler.tagesschau.tasks.TagesschauVideoTask; import de.mediathekview.mserver.daten.Film; import de.mediathekview.mserver.daten.Sender; @@ -15,9 +14,11 @@ import org.apache.logging.log4j.Logger; import java.util.Collection; +import java.util.HashSet; import java.util.Queue; import 
java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.ExecutionException; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.RecursiveTask; @@ -41,33 +42,42 @@ public Sender getSender() { @Override protected RecursiveTask> createCrawlerTask() { try { - Queue archiveUrl = createArchiveUrl(); - - TagesschauOverviewTask overviewTask = new TagesschauOverviewTask(this, archiveUrl); - final Set overviewResults = this.forkJoinPool.submit(overviewTask).get(); - - // TODO nur für den aktuellen Monat passt die Logik - // für alle anderen Einträge muss rekursive OverviewTask genutzt werden, bis die Monatsseite erreicht ist - - LOG.debug("Overview task completed. Found {} overview URLs.", overviewResults.size()); - - TagesschauEnriesTask entriesTask = new TagesschauEnriesTask(this, new ConcurrentLinkedQueue<>(overviewResults)); - final Set entriesResults = this.forkJoinPool.submit(entriesTask).get(); - - LOG.debug("Entries task completed. Found {} entry URLs.", entriesResults.size()); - - getAndSetMaxCount(entriesResults.size()); + Set videos = new HashSet<>(); + Queue inputQueue = createArchiveUrl(); + + // short run uses 2 recursion -> only the actual month is included + int recursionMax = Boolean.TRUE.equals(crawlerConfig.getTopicsSearchEnabled()) ? 
10 : 2; + int recursionCount = 0; + + while (!inputQueue.isEmpty() && recursionCount < recursionMax) { + LOG.debug("processing {} sub pages", inputQueue.size()); + TagesschauEntriesTask round1 = new TagesschauEntriesTask(this, inputQueue); + final Set results = this.forkJoinPool.submit(round1).get(); + + Set subPages = new HashSet<>(); + results.forEach( + result -> { + videos.addAll(result.getVideos()); + subPages.addAll(result.getSubPages()); + }); + inputQueue = new ConcurrentLinkedQueue<>(subPages); + recursionCount++; + } + + getAndSetMaxCount(videos.size()); printMessage( ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), - entriesResults.size()); + videos.size()); - return new TagesschauVideoTask(this, new ConcurrentLinkedQueue<>(entriesResults)); + return new TagesschauVideoTask(this, new ConcurrentLinkedQueue<>(videos)); - } catch (final Exception ex) { + } catch (final InterruptedException ex) { + LOG.fatal("Exception in Tagesschau crawler.", ex); + Thread.currentThread().interrupt(); + } catch (final ExecutionException ex) { LOG.fatal("Exception in Tagesschau crawler.", ex); - printErrorMessage(); } return null; } diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java index c50c592c1..e9c512d07 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java @@ -7,19 +7,18 @@ import de.mediathekview.mserver.base.utils.JsonUtils; import de.mediathekview.mserver.crawler.basic.AbstractCrawler; import de.mediathekview.mserver.daten.*; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - import java.lang.reflect.Type; import java.net.MalformedURLException; import java.net.URI; -import java.time.Duration; -import 
java.time.LocalDateTime; -import java.time.OffsetDateTime; -import java.time.ZoneId; +import java.time.*; import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatterBuilder; import java.time.format.DateTimeParseException; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; public class TagesschauVideoDeserializer implements JsonDeserializer> { private static final String ELEMENT_MC = "mc"; @@ -40,6 +39,20 @@ public class TagesschauVideoDeserializer implements JsonDeserializer> private static final String ATTRIBUTE_LINK = "link"; private static final String[] SUPPORTED_MIME_TYPES = new String[] { "video/mp4" }; + private static final Pattern LONG_MONTH_PATTERN = + Pattern.compile("(\\d{1,2}(?:\\.|\\s)\\s*[A-Za-zÄÖÜäöüß]+\\s+\\d{4})"); + private static final DateTimeFormatter GERMAN_LONG = new DateTimeFormatterBuilder() + .parseCaseInsensitive() + .appendPattern("d. 
MMMM uuuu") + .toFormatter(Locale.GERMAN); + private static final DateTimeFormatter GERMAN_LONG_NO_SPACE = new DateTimeFormatterBuilder() + .parseCaseInsensitive() + .appendPattern("d.MMMM uuuu") + .toFormatter(Locale.GERMAN); + private static final DateTimeFormatter GERMAN_LONG_NO_DOT = new DateTimeFormatterBuilder() + .parseCaseInsensitive() + .appendPattern("d MMMM uuuu") + .toFormatter(Locale.GERMAN); private static final DateTimeFormatter DATE_TIME_FORMATTER = DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ssZ", Locale.GERMANY); // 2016-10-29T16:15:00+02:00 private static final String GERMAN_TIME_ZONE = "Europe/Berlin"; @@ -50,6 +63,25 @@ public TagesschauVideoDeserializer(AbstractCrawler crawler) { this.crawler = crawler; } + private static Optional parseDate(final JsonObject metaObject, final Optional titleDate) { + final Optional dateValue = + JsonUtils.getAttributeAsString(metaObject, ATTRIBUTE_DATE); + if (dateValue.isPresent()) { + try { + final OffsetDateTime inputDateTime = OffsetDateTime.parse(dateValue.get(), DATE_TIME_FORMATTER); + LocalDateTime localDateTime = inputDateTime.atZoneSameInstant(ZoneId.of(GERMAN_TIME_ZONE)).toLocalDateTime(); + if (titleDate.isPresent() && titleDate.get().getYear() != localDateTime.getYear()) { + localDateTime = localDateTime.withYear(titleDate.get().getYear()); + } + return Optional.of(localDateTime); + } catch (final DateTimeParseException ex) { + LOG.error("Error parsing date time value {}", dateValue.get(), ex); + } + } + + return titleDate.map(localDate -> LocalDateTime.of(localDate, LocalTime.of(20, 0))); + } + @Override public List deserialize( JsonElement jsonElement, Type type, JsonDeserializationContext jsonDeserializationContext) { @@ -64,20 +96,17 @@ public List deserialize( final Optional topic = JsonUtils.getAttributeAsString(metaObject, ATTRIBUTE_TOPIC); final Optional title = JsonUtils.getAttributeAsString(metaObject, ATTRIBUTE_TITLE); final Optional duration = JsonUtils.getAttributeAsInt(metaObject, 
ATTRIBUTE_DURATION); - final Optional date = parseDate(metaObject); + final Optional date = parseDate(metaObject, parseDateFromTitle(title.orElse(""))); final Map urls = parseUrls(mcElement.get().getAsJsonObject()); final String website = parseWebsite(mcElement.get().getAsJsonObject()); - // TODO Prüfungen auf Topic+Titel - // TODO Zeitzone passt nicht - final Film film = new Film( UUID.randomUUID(), Sender.TAGESSCHAU24, title.orElse(""), topic.orElse(""), - date.get(), + date.orElse(LocalDateTime.now()), duration.isEmpty() ? Duration.ofSeconds(0) : Duration.ofSeconds(duration.get())); film.addGeolocation(GeoLocations.GEO_NONE); if (!website.isEmpty()) { @@ -103,46 +132,70 @@ public List deserialize( return results; } + private Optional parseDateFromTitle(String title) { + Matcher m = LONG_MONTH_PATTERN.matcher(title); + if (m.find()) { + String datePart = m.group(1).replaceAll("\\s+", " ").trim(); + try { + return Optional.of(LocalDate.parse(datePart, GERMAN_LONG)); + } catch (DateTimeParseException ignored) { + // try other conversion + } + try { + return Optional.of(LocalDate.parse(datePart, GERMAN_LONG_NO_SPACE)); + } catch (DateTimeParseException ignored) { + // try other conversion + } + try { + return Optional.of(LocalDate.parse(datePart, GERMAN_LONG_NO_DOT)); + } catch (DateTimeParseException ex) { + LOG.warn("no valid date converted", ex); + } + } + return Optional.empty(); + } + private String parseWebsite(JsonObject mcObject) { return JsonUtils.getElementValueAsString(mcObject, ELEMENT_PLUG_IN_DATA, ELEMENT_SHARING_WEB, ATTRIBUTE_LINK).orElse(""); } private Map parseUrls(final JsonObject mcObject) { - // TODO robust machen gegen fehlende Elemente final Map urls = new EnumMap<>(Resolution.class); - - mcObject.get(ELEMENT_STREAMS).getAsJsonArray().forEach(stream -> { - stream.getAsJsonObject().get(ELEMENT_MEDIA).getAsJsonArray().forEach(media -> { - final Optional mimeType = JsonUtils.getElementValueAsString(media, ATTRIBUTE_MIMETYPE); - if 
(mimeType.isPresent() && Arrays.stream(SUPPORTED_MIME_TYPES).anyMatch(type -> type.equals(mimeType.get()))) { - final Optional width = JsonUtils.getAttributeAsInt(media.getAsJsonObject(), ATTRIBUTE_WIDTH); - final Optional url = JsonUtils.getElementValueAsString(media, ATTRIBUTE_URL); - - if (width.isPresent() && url.isPresent()) { - final Resolution resolution = Resolution.getResolutionFromWidth(width.get()); - urls.put(resolution, url.get()); - } - } - }); - }); - - return urls; - } - - private static Optional parseDate(final JsonObject metaObject) { - final Optional dateValue = - JsonUtils.getAttributeAsString(metaObject, ATTRIBUTE_DATE); - if (dateValue.isPresent()) { - try { - final OffsetDateTime inputDateTime = OffsetDateTime.parse(dateValue.get(), DATE_TIME_FORMATTER); - final LocalDateTime localDateTime = inputDateTime.atZoneSameInstant(ZoneId.of(GERMAN_TIME_ZONE)).toLocalDateTime(); - return Optional.of(localDateTime); - } catch (final DateTimeParseException ex) { - LOG.error("Error parsing date time value {}", dateValue.get(), ex); - } + if (mcObject.has(ELEMENT_STREAMS) && mcObject.get(ELEMENT_STREAMS).isJsonArray()) { + mcObject + .get(ELEMENT_STREAMS) + .getAsJsonArray() + .forEach( + stream -> { + final JsonObject streamObject = stream.getAsJsonObject(); + if (streamObject.has(ELEMENT_MEDIA) + && streamObject.get(ELEMENT_MEDIA).isJsonArray()) { + streamObject + .get(ELEMENT_MEDIA) + .getAsJsonArray() + .forEach( + media -> { + final Optional mimeType = + JsonUtils.getElementValueAsString(media, ATTRIBUTE_MIMETYPE); + if (mimeType.isPresent() + && Arrays.stream(SUPPORTED_MIME_TYPES) + .anyMatch(type -> type.equals(mimeType.get()))) { + final Optional width = + JsonUtils.getAttributeAsInt( + media.getAsJsonObject(), ATTRIBUTE_WIDTH); + final Optional url = + JsonUtils.getElementValueAsString(media, ATTRIBUTE_URL); + + if (width.isPresent() && url.isPresent()) { + final Resolution resolution = + Resolution.getResolutionFromWidth(width.get()); + 
urls.put(resolution, url.get()); + } + } + }); + } + }); } - - return Optional.empty(); + return urls; } - } diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEnriesTask.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTask.java similarity index 55% rename from src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEnriesTask.java rename to src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTask.java index 2e24ce114..9264c8428 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEnriesTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTask.java @@ -4,6 +4,7 @@ import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask; import de.mediathekview.mserver.crawler.basic.AbstractRecursiveConverterTask; import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; +import de.mediathekview.mserver.crawler.tagesschau.EntryUrlDto; import de.mediathekview.mserver.crawler.tagesschau.TagesschauConstants; import java.util.Arrays; @@ -17,54 +18,53 @@ import org.jsoup.nodes.Element; import org.jsoup.select.Elements; -public class TagesschauEnriesTask extends AbstractDocumentTask { - private static final Logger LOG = LogManager.getLogger(TagesschauEnriesTask.class); +public class TagesschauEntriesTask extends AbstractDocumentTask { + private static final Logger LOG = LogManager.getLogger(TagesschauEntriesTask.class); + private static final Pattern PATTERN_VIDEO = Pattern.compile(".*/video-\\d+\\.html$"); + private static final Pattern PATTERN_SUB_PAGE = Pattern.compile(".*/(tsvorzwanzigjahren(?:-ts)?-?\\d+)\\.html$"); private static final String[] BLACKLIST = new String[] {TagesschauConstants.ARCHIVE_START_URL}; - public TagesschauEnriesTask(final AbstractCrawler crawler, final Queue queue) { + public TagesschauEntriesTask(final AbstractCrawler crawler, final Queue queue) { 
super(crawler, queue); } @Override protected void processDocument(CrawlerUrlDTO aUrlDTO, Document aDocument) { + EntryUrlDto result = new EntryUrlDto(); LOG.debug("Processing Tagesschau overview page: {}", aUrlDTO.getUrl()); - // Find links that reference the "vor20jahren" archives. The page contains two - // kinds of URLs for year/overview pages, e.g.: - // - /multimedia/tsvorzwanzigjahren-472.html - // - /inland/tsvorzwanzigjahren-ts-100.html final Elements links = aDocument.select(".teaser-absatz__link"); - // Pattern to validate and capture the numeric id - final Pattern p = Pattern.compile(".*/video-\\d+\\.html$"); - for (final Element link : links) { try { final String href = link.attr("href"); - if (href == null || href.isEmpty()) { + if (href.isEmpty()) { continue; } // normalize to absolute final String fullUrl = href.startsWith("http") ? href : "https://www.tagesschau.de" + (href.startsWith("/") ? "" : "/") + href; - final Matcher m = p.matcher(fullUrl); - if (m.find() && Arrays.stream(BLACKLIST).noneMatch(fullUrl::equalsIgnoreCase)) { - // Add the URL (deduplication is handled by the Set in taskResults) - taskResults.add(new CrawlerUrlDTO(fullUrl)); - crawler.incrementAndGetActualCount(); + if (Arrays.stream(BLACKLIST).noneMatch(fullUrl::equalsIgnoreCase)) { + final Matcher matcherSubPage = PATTERN_SUB_PAGE.matcher(fullUrl); + final Matcher matcherVideo = PATTERN_VIDEO.matcher(fullUrl); + if (matcherSubPage.find()) { + result.addSubPage(new CrawlerUrlDTO(fullUrl)); + } else if (matcherVideo.find()) { + result.addVideo(new CrawlerUrlDTO(fullUrl)); + } } } catch (final Exception e) { - LOG.debug("Error while processing overview link", e); - crawler.incrementAndGetErrorCount(); + LOG.error("Error while processing overview link", e); } } + taskResults.add(result); } @Override - protected AbstractRecursiveConverterTask createNewOwnInstance( + protected AbstractRecursiveConverterTask createNewOwnInstance( Queue aElementsToProcess) { - return new 
TagesschauEnriesTask(crawler, aElementsToProcess); + return new TagesschauEntriesTask(crawler, aElementsToProcess); } } diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTask.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTask.java deleted file mode 100644 index 9a5baf959..000000000 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTask.java +++ /dev/null @@ -1,80 +0,0 @@ -package de.mediathekview.mserver.crawler.tagesschau.tasks; - -import de.mediathekview.mserver.crawler.basic.AbstractCrawler; -import de.mediathekview.mserver.crawler.basic.AbstractDocumentTask; -import de.mediathekview.mserver.crawler.basic.AbstractRecursiveConverterTask; -import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; -import de.mediathekview.mserver.crawler.tagesschau.TagesschauConstants; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.select.Elements; - -import java.util.Arrays; -import java.util.regex.Pattern; -import java.util.regex.Matcher; - -import java.util.Queue; - -/** - * Overview task for Tagesschau archive pages. - * Reads an overview page and extracts URLs to daily archive pages (as CrawlerUrlDTO). - */ -public class TagesschauOverviewTask extends AbstractDocumentTask { - - private static final Logger LOG = LogManager.getLogger(TagesschauOverviewTask.class); - - private static final String[] BLACKLIST = new String[] { - TagesschauConstants.ARCHIVE_START_URL - }; - - public TagesschauOverviewTask(final AbstractCrawler aCrawler, final Queue aUrls) { - super(aCrawler, aUrls); - } - - @Override - protected void processDocument(final CrawlerUrlDTO aUrlDTO, final Document aDocument) { - LOG.debug("Processing Tagesschau overview page: {}", aUrlDTO.getUrl()); - - // Find links that reference the "vor20jahren" archives. 
The page contains two - // kinds of URLs for year/overview pages, e.g.: - // - /multimedia/tsvorzwanzigjahren-472.html - // - /inland/tsvorzwanzigjahren-ts-100.html - final Elements links = aDocument.select("a[href*='tsvorzwanzigjahren']"); - - // Pattern to validate and capture the numeric id or year slug at the end - final Pattern p = Pattern.compile(".*/(tsvorzwanzigjahren(?:-ts)?-?\\d+)\\.html$"); - - for (final Element link : links) { - try { - final String href = link.attr("href"); - if (href == null || href.isEmpty()) { - continue; - } - // normalize to absolute - final String fullUrl = href.startsWith("http") ? href : "https://www.tagesschau.de" + (href.startsWith("/") ? "" : "/") + href; - - final Matcher m = p.matcher(fullUrl); - if (m.find() && Arrays.stream(BLACKLIST).noneMatch(fullUrl::equalsIgnoreCase)) { - // Add the URL (deduplication is handled by the Set in taskResults) - taskResults.add(new CrawlerUrlDTO(fullUrl)); - crawler.incrementAndGetActualCount(); - } - } catch (final Exception e) { - LOG.debug("Error while processing overview link", e); - crawler.incrementAndGetErrorCount(); - } - } - } - - @Override - protected AbstractRecursiveConverterTask createNewOwnInstance( - final Queue aElementsToProcess) { - return new TagesschauOverviewTask(crawler, aElementsToProcess); - } -} - - - - diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTask.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTask.java index 78b9478ec..f943d90b6 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauVideoTask.java @@ -43,6 +43,7 @@ protected void processDocument(CrawlerUrlDTO aUrlDTO, Document aDocument) { taskResults.addAll(films); crawler.incrementAndGetActualCount(); } catch (Exception e) { + LOG.error(e); crawler.incrementAndGetErrorCount(); } }); diff --git 
a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializerTest.java b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializerTest.java index b496466b2..44795eeb3 100644 --- a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializerTest.java +++ b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializerTest.java @@ -49,7 +49,18 @@ void test() { ""); } - protected TagesschauCrawler createCrawler() { + @Test + void testWrongYear() { + + final JsonElement jsonElement = + JsonFileReader.readJson("/tagesschau/tagesschau_20jahre_video_date.json"); + TagesschauVideoDeserializer target = new TagesschauVideoDeserializer(createCrawler()); + final List actual = target.deserialize(jsonElement, null, null); + assertNotNull(actual); + assertEquals(1, actual.size()); + assertEquals(LocalDateTime.of(1996, 7, 23, 22, 45, 0), actual.getFirst().getTime()); +} + protected TagesschauCrawler createCrawler() { final ForkJoinPool forkJoinPool = new ForkJoinPool(); final Collection nachrichten = new ArrayList<>(); final Collection fortschritte = new ArrayList<>(); diff --git a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTaskTest.java b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTaskTest.java index 8fa80a3be..51690e011 100644 --- a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTaskTest.java +++ b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTaskTest.java @@ -1,10 +1,11 @@ package de.mediathekview.mserver.crawler.tagesschau.tasks; import static org.hamcrest.MatcherAssert.assertThat; -import static org.junit.jupiter.api.Assertions.*; +import static org.hamcrest.Matchers.equalTo; import de.mediathekview.mserver.base.webaccess.JsoupConnection; import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; +import 
de.mediathekview.mserver.crawler.tagesschau.EntryUrlDto; import de.mediathekview.mserver.crawler.tagesschau.TagesschauCrawler; import de.mediathekview.mserver.testhelper.JsoupMock; import java.util.Set; @@ -45,15 +46,84 @@ void testMonth() { jsoupConnection = JsoupMock.mock( requestUrl, "/tagesschau/tagesschau_20jahre_month.html"); + final Set actual = executeTask(requestUrl); + assertThat(actual.size(), equalTo(1)); + final EntryUrlDto actualEntry = actual.iterator().next(); + assertThat(actualEntry.getSubPages().size(), equalTo(0)); + assertThat(actualEntry.getVideos().size(), equalTo(expectedUrls.length)); + assertThat(actualEntry.getVideos(), Matchers.containsInAnyOrder(expectedUrls)); + } + + + @Test + void testYear() { + final String requestUrl = "http://tagesschau-year.de"; + final CrawlerUrlDTO[] expectedUrls = + new CrawlerUrlDTO[] { + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-476.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-474.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-468.html") + }; + + jsoupConnection = + JsoupMock.mock( + requestUrl, "/tagesschau/tagesschau_20jahre_year.html"); + + final Set actual = executeTask(requestUrl); + assertThat(actual.size(), equalTo(1)); + final EntryUrlDto actualEntry = actual.iterator().next(); + assertThat(actualEntry.getSubPages().size(), equalTo(expectedUrls.length)); + assertThat(actualEntry.getVideos().size(), equalTo(0)); + assertThat(actualEntry.getSubPages(), Matchers.containsInAnyOrder(expectedUrls)); + } + + @Test + void testOverview() { + final String requestUrl = "http://tagesschau-overview.de"; + final CrawlerUrlDTO[] expectedUrls = + new CrawlerUrlDTO[] { + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-478.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-472.html"), + new 
CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-442.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-416.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-387.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-359.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-327.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-301.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-257.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-221.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-183.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-147.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-121.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-136.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-116.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-100.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-106.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-104.html"), + new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-102.html") + }; + + jsoupConnection = + JsoupMock.mock( + requestUrl, "/tagesschau/tagesschau_20jahre_overview.html"); + + final Set actual = executeTask(requestUrl); + assertThat(actual.size(), equalTo(1)); + final EntryUrlDto actualEntry = actual.iterator().next(); + assertThat(actualEntry.getSubPages().size(), equalTo(expectedUrls.length)); + assertThat(actualEntry.getVideos().size(), equalTo(0)); + assertThat(actualEntry.getSubPages(), Matchers.containsInAnyOrder(expectedUrls)); + } + + private Set executeTask(String requestUrl) { 
final TagesschauCrawler crawler = createCrawler(); crawler.setConnection(jsoupConnection); final ConcurrentLinkedQueue queue = new ConcurrentLinkedQueue<>(); queue.add(new CrawlerUrlDTO(requestUrl)); - final TagesschauEnriesTask target = new TagesschauEnriesTask(crawler, queue); - final Set actual = target.invoke(); - assertEquals(expectedUrls.length, actual.size()); - assertThat(actual, Matchers.containsInAnyOrder(expectedUrls)); + final TagesschauEntriesTask target = new TagesschauEntriesTask(crawler, queue); + return target.invoke(); } + } diff --git a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTaskTest.java b/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTaskTest.java deleted file mode 100644 index 1ff8954de..000000000 --- a/src/test/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauOverviewTaskTest.java +++ /dev/null @@ -1,96 +0,0 @@ -package de.mediathekview.mserver.crawler.tagesschau.tasks; - -import static org.hamcrest.MatcherAssert.assertThat; -import static org.junit.jupiter.api.Assertions.*; -import static org.mockito.Mockito.*; - -import de.mediathekview.mserver.base.webaccess.JsoupConnection; -import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO; -import de.mediathekview.mserver.crawler.tagesschau.TagesschauCrawler; -import de.mediathekview.mserver.testhelper.JsoupMock; -import java.util.Set; -import java.util.concurrent.ConcurrentLinkedQueue; -import org.hamcrest.Matchers; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.extension.ExtendWith; -import org.mockito.Mock; -import org.mockito.MockitoAnnotations; -import org.mockito.junit.jupiter.MockitoExtension; - -@ExtendWith(MockitoExtension.class) -class TagesschauOverviewTaskTest extends TagesschauTaskTestBase { - - @Mock JsoupConnection jsoupConnection; - - @BeforeEach - void setUp() { - MockitoAnnotations.openMocks(this); - } - - @Test - void 
test() { - final String requestUrl = "http://tagesschau-overview.de"; - final CrawlerUrlDTO[] expectedUrls = - new CrawlerUrlDTO[] { - new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-478.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-472.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-442.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-416.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-387.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-359.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-327.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-301.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-257.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-221.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-183.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-147.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-121.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-136.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-116.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-100.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-106.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-104.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-102.html") - }; - - jsoupConnection = - JsoupMock.mock( - requestUrl, "/tagesschau/tagesschau_20jahre_overview.html"); - final TagesschauCrawler crawler = createCrawler(); - crawler.setConnection(jsoupConnection); - - final 
ConcurrentLinkedQueue queue = new ConcurrentLinkedQueue<>(); - queue.add(new CrawlerUrlDTO(requestUrl)); - - final TagesschauOverviewTask target = new TagesschauOverviewTask(crawler, queue); - final Set actual = target.invoke(); - assertEquals(expectedUrls.length, actual.size()); - assertThat(actual, Matchers.containsInAnyOrder(expectedUrls)); - } - - @Test - void testYear() { - final String requestUrl = "http://tagesschau-year.de"; - final CrawlerUrlDTO[] expectedUrls = - new CrawlerUrlDTO[] { - new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-476.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-474.html"), - new CrawlerUrlDTO("https://www.tagesschau.de/multimedia/tsvorzwanzigjahren-468.html") - }; - - jsoupConnection = - JsoupMock.mock( - requestUrl, "/tagesschau/tagesschau_20jahre_year.html"); - final TagesschauCrawler crawler = createCrawler(); - crawler.setConnection(jsoupConnection); - - final ConcurrentLinkedQueue queue = new ConcurrentLinkedQueue<>(); - queue.add(new CrawlerUrlDTO(requestUrl)); - - final TagesschauOverviewTask target = new TagesschauOverviewTask(crawler, queue); - final Set actual = target.invoke(); - assertEquals(expectedUrls.length, actual.size()); - assertThat(actual, Matchers.containsInAnyOrder(expectedUrls)); - } -} diff --git a/src/test/resources/tagesschau/tagesschau_20jahre_video_date.json b/src/test/resources/tagesschau/tagesschau_20jahre_video_date.json new file mode 100644 index 000000000..81a940463 --- /dev/null +++ b/src/test/resources/tagesschau/tagesschau_20jahre_video_date.json @@ -0,0 +1,371 @@ +{ + "pc":{ + "web":{ + "baseUrl":"/resources/assets/js/vendor/ardplayer/", + "isForcedAutoPlay":false, + "isEnablePostMessage":false, + "isForcedVideoView":true, + "disableBackButtonTitle":true, + "disablePosterTitle":true, + "disablePosterImage":true + }, + "generic":{ + "imageTemplateConfig":{ + "size":[ + { + "minWidth":0, + "value":"AAABnSSvsdE/16x9-small" + }, + { + 
"minWidth":568, + "value":"AAABnSSvrFg/16x9-big" + } + ], + "width":{ + "min":320, + "max":1920, + "stepSize":320 + } + }, + "isAutoplay":false + }, + "pluginData":{ + + } + }, + "mc":{ + "streams":[ + { + "media":[ + { + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.webl.h264.mp4", + "mimeType":"video/mp4", + "fallbackGroup":4, + "maxHResolutionPx":960, + "audios":[ + { + "kind":"standard", + "languageCode":"de" + } + ] + }, + { + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.webs.h264.mp4", + "mimeType":"video/mp4", + "fallbackGroup":9, + "maxHResolutionPx":480, + "audios":[ + { + "kind":"standard", + "languageCode":"de" + } + ] + }, + { + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.websm.h264.mp4", + "mimeType":"video/mp4", + "audios":[ + { + "kind":"standard", + "languageCode":"de" + } + ] + }, + { + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.webm.h264.mp4", + "mimeType":"video/mp4", + "fallbackGroup":6, + "maxHResolutionPx":640, + "audios":[ + { + "kind":"standard", + "languageCode":"de" + } + ] + }, + { + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.webml.h264.mp4", + "mimeType":"video/mp4", + "fallbackGroup":8, + "maxHResolutionPx":512, + "audios":[ + { + "kind":"standard", + "languageCode":"de" + } + ] + }, + { + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.weblp.h264.mp4", + "mimeType":"video/mp4", + "fallbackGroup":4, + "maxHResolutionPx":960, + "audios":[ + { + "kind":"standard", + "languageCode":"de" + } + ] + }, + { + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.webxl.h264.mp4", + "mimeType":"video/mp4", + "fallbackGroup":2, + "maxHResolutionPx":1280, + "audios":[ + { + "kind":"standard", + "languageCode":"de" + } + ] + }, + { + 
"url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.webm.webm", + "mimeType":"video/mp4", + "audios":[ + { + "kind":"standard", + "languageCode":"de" + } + ] + }, + { + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.webl.webm", + "mimeType":"video/mp4", + "audios":[ + { + "kind":"standard", + "languageCode":"de" + } + ] + }, + { + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.h264.mp4", + "mimeType":"video/mp4", + "audios":[ + { + "kind":"standard", + "languageCode":"de" + } + ] + }, + { + "url":"https://adaptive.tagesschau.de/i/video/2016/0721/TV-20160721-1203-0101,.h264.mp4,.webml.h264.mp4,.weblp.h264.mp4,.webs.h264.mp4,.webl.h264.mp4,.webxl.h264.mp4,.webm.h264.mp4,.csmil/master.m3u8", + "mimeType":"application/vnd.apple.mpegurl", + "audios":[ + { + "kind":"standard", + "languageCode":"de" + } + ], + "isAdaptiveQualitySelectable":true + } + ], + "kind":"main", + "isAudioOnly":false + } + ], + "meta":{ + "images":[ + { + "url":"https://images.tagesschau.de/image/72ef28e9-ed07-436b-a2cf-101df86a10b2/AAABliKBGks/{size}/sendungsbild-186911.webp?width={width}", + "kind":"preview", + "title":"Sendungsbild | ARD-aktuell", + "alt":"Sendungsbild" + }, + { + "url":"https://images.tagesschau.de/image/72ef28e9-ed07-436b-a2cf-101df86a10b2/AAABliKBGks/{size}/sendungsbild-186911.jpg?width={width}", + "kind":"preview", + "title":"Sendungsbild | ARD-aktuell", + "alt":"Sendungsbild" + } + ], + "title":"tagesschau vor 20 Jahren, 23 Juli 1996", + "seriesTitle":"tagesschau vor 20 Jahren", + "durationSeconds":944, + "broadcastedOnDateTime":"2016-07-23T20:45:00+0000", + "showBroadcastedOnWithTime":true, + "synopsis":"23 Juli 1996" + }, + "pluginData":{ + "sharing@web":{ + "link":"https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-201449.html", + "body":"Meine Empfehlung: „tagesschau vor 20 Jahren, 23 Juli 1996” 
\nhttps://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-201449.html", + "subject":"Entdeckt auf tagesschau.de: tagesschau vor 20 Jahren, 23 Juli 1996", + "disableSubclipping":true, + "services":[ + "url", + "email", + "whatsapp", + "facebook", + "fb-messenger", + "twitter", + "telegram", + "threema", + "embed" + ], + "embedCode":"<iframe src=\"https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-201449~player.html$params$\" width=\"512\" height=\"288\" allowfullscreen frameBorder=\"0\" scrolling=\"no\"></iframe>", + "params":"?startTime=$start$&endTime=$ende$", + "legal":"Durch die Einbettung von ARD-Videos auf Ihrer Webseite stimmen Sie den <a href=\"https://www.ardmediathek.de/ard/nutzungsbedingungen\" target=\"_blank\">ARD Nutzungsbedingungen</a> zu.", + "embedDialogTitle":"Inhalt einbetten" + }, + "download@web":{ + "isEnabled":true, + "sources":[ + { + "title":"SD 540p", + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.webl.h264.mp4?download=true" + }, + { + "title":"SD 270p", + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.webs.h264.mp4?download=true" + }, + { + "title":"SD 360p", + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.webm.h264.mp4?download=true" + }, + { + "title":"SD 360p", + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.webml.h264.mp4?download=true" + }, + { + "title":"SD 540p", + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.weblp.h264.mp4?download=true" + }, + { + "title":"HD 720p", + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.webxl.h264.mp4?download=true" + }, + { + "title":"SD 540p", + "url":"https://tagesschau-progressive.ard-mcdn.de/video/2016/0721/TV-20160721-1203-0101.h264.mp4?download=true" + }, + { + "title":"SD 540p", + 
"url":"https://tagesschau-podcast.ard-mcdn.de/audio/2016/0721/TV-20160721-1203-0101.mp3?download=true" + } + ] + }, + "trackingPiano@all":{ + "config":{ + "dimensionTransform":{ + "av_autoplay":"av_auto_mode" + }, + "events":[ + "av.speed", + "av.quality", + "av.error", + "av.volume", + "av.volume.mute", + "av.share", + "av.dialog.open", + "av.dialog.close", + "av.playermode", + "av.jumpmark", + "av.subtitle", + "av.language", + "av.audiodescription", + "av.signlanguage", + "av.set.sleeptimer", + "av.playlist", + "av.embed.click", + "av.embed.title", + "av.recommendation" + ] + }, + "avContent":{ + "atiTagConfig":{ + "site":595936, + "collectDomain":"vqggwrz.pa-cd.com", + "addEventURL":"true" + }, + "s:tgp_page_chapter1":"multimedia", + "s:tgp_page_chapter2":"sendung", + "s:tgp_page_chapter3":"tagesschau_vor_20_jahren", + "s:technical_tagging_guide":"Unified", + "s:site_level2":"www.tagesschau.de", + "s:tgp_product_platform":"Web", + "b:product_login":true, + "content_authors":[ + "tagesschau" + ], + "s:page":"tagesschau vor 20 Jahren, 23 Juli 1996", + "b:tgp_page_contains_video":true, + "b:tgp_page_contains_audio":false, + "s:tgp_content_object_type":"Video", + "d:tgp_content_publication_time":"1996-07-23T20:00:00Z", + "s:tgp_content_external_id":"tagesschau_TV-20160721-1203-0101", + "s:tgp_content_id":"video-201449", + "d:tgp_content_last_editorial_update":"1996-07-23T20:00:00Z", + "s:tgp_content_creator":"tagesschau", + "s:tgp_content_creator_institution":"tagesschau", + "s:tgp_content_broadcast_station":"Das Erste", + "s:tgp_content_show":"tagesschau vor 20 Jahren", + "s:tgp_content_show_id":"tsv20", + "b:tgp_content_broadcast_reference":true, + "n:tgp_content_days_since_publication":10888, + "s:tgp_content_teaser_image_url":"https://images.tagesschau.de/image/72ef28e9-ed07-436b-a2cf-101df86a10b2/AAABliKBGks/AAABnR8VW9w/original/sendungsbild-186911.jpg", + "s:tgp_content_url_path":"/multimedia/sendung/tagesschau_vor_20_jahren/video-201449.html", + 
"b:tgp_content_author_visible":false, + "n:av_content_duration":944000, + "s:av_content":"tagesschau vor 20 Jahren, 23 Juli 1996", + "av_content_id":"video-201449", + "s:av_show":"tagesschau vor 20 Jahren", + "s:av_content_type":"Video", + "s:av_broadcasting_type":"OnDemand", + "d:av_original_air_time":"2016-07-23T22:45:00Z", + "b:av_web_only":true, + "b:av_full_show":false, + "s:av_player":"ARD Player", + "s:av_content_creator":"tagesschau", + "s:av_content_external_id":"tagesschau_TV-20160721-1203-0101", + "s:av_content_crid":"crid://tagesschau.de/tagesschau_TV-20160721-1203-0101", + "s:av_institution":"tagesschau", + "s:av_publisher":"Das Erste", + "s:av_show_id":"tsv20", + "s:av_content_url":"https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-201449.html", + "d:av_publication_time":"1996-07-23T20:00:00Z" + }, + "isEnabled":true + }, + "trackingAgf@all":{ + "appId":"PE6FF1BB7-FE88-4674-B083-2772ADAD55E9", + "playerID":"video-201449", + "clipData":{ + "type":"content", + "assetid":"video-201449_0", + "program":"tagesschau vor 20 Jahren", + "length":"944", + "title":"Das Erste_tagesschau vor 20 Jahren_tagesschau vor 20 Jahren, 23 Juli 1996_1996.07.23 20:00:00", + "nol_c0":"p0,0", + "nol_c2":"p2,N", + "nol_c5":"p5,https://www.tagesschau.de/multimedia/sendung/tagesschau_vor_20_jahren/video-201449.html", + "nol_c7":"p7,video-201449", + "nol_c9":"p9,tagesschau vor 20 Jahren_tagesschau vor 20 Jahren, 23 Juli 1996_1996.07.23 20:00:00", + "nol_c10":"p10,Das Erste", + "nol_c12":"p12,Content", + "nol_c16":"p16,ARD_Information", + "nol_c18":"p18,N" + }, + "sfcode":"eu", + "prod":"vc", + "apn":"ardplayer", + "agfMetaDataSDK":{ + "censuscategory":"Das Erste_tagesschau vor 20 Jahren_tagesschau vor 20 Jahren, 23 Juli 1996_1996.07.23 20:00:00", + "livestream":"no" + } + } + } + }, + "mediadescription":"tagesschau vor 20 Jahren, 23 Juli 1996", + "playerType":"video", + "ratio":"16x9", + "context":"avdetail", + "posterImage":{ + "altText":"Sendungsbild", + 
"title":"Sendungsbild | ARD-aktuell", + "urlS":"https://images.tagesschau.de/image/72ef28e9-ed07-436b-a2cf-101df86a10b2/AAABliKBGks/AAABnSSvrFg/16x9-big/sendungsbild-186911.webp?width=640", + "urlM":"https://images.tagesschau.de/image/72ef28e9-ed07-436b-a2cf-101df86a10b2/AAABliKBGks/AAABnSSvrFg/16x9-big/sendungsbild-186911.webp?width=768", + "urlL":"https://images.tagesschau.de/image/72ef28e9-ed07-436b-a2cf-101df86a10b2/AAABliKBGks/AAABnSSvrFg/16x9-big/sendungsbild-186911.webp?width=1280" + } +} \ No newline at end of file From 4b12657584b122e013066e17d061bdbd56049426 Mon Sep 17 00:00:00 2001 From: pidoubleyou <22942659+pidoubleyou@users.noreply.github.com> Date: Wed, 20 May 2026 21:20:51 +0200 Subject: [PATCH 04/10] fixes --- .../mserver/crawler/tagesschau/TagesschauConstants.java | 9 --------- .../mserver/crawler/tagesschau/TagesschauCrawler.java | 7 ++++++- .../tagesschau/json/TagesschauVideoDeserializer.java | 4 ++-- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauConstants.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauConstants.java index 9bd3f3b0c..237014d64 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauConstants.java +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauConstants.java @@ -9,15 +9,6 @@ public final class TagesschauConstants { // Starting point: Tagesschau vor 20 Jahren (20 years ago) public static final String ARCHIVE_START_URL = "https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-142.html"; - // Pattern for accessing specific month archives - // Example: /multimedia/sendung/ts/vor20jahren/ts-vor20jahren-20060401.html - public static final String ARCHIVE_DAY_URL_PATTERN = "https://www.tagesschau.de/multimedia/sendung/ts/vor20jahren/ts-vor20jahren-%s.html"; - - // Base URL for archive pages - public static final String ARCHIVE_MONTH_BASE = 
"https://www.tagesschau.de/multimedia/sendung/ts/vor20jahren/"; - - public static final String VIDEO_JSON = "https://zagent7.h-cdn.com/cmd/get_links_info?customer=ard_de&zone=gen&ver=1.165.211&url=https%3A%2F%2Fwww.tagesschau.de%2Fmultimedia%2Fsendung%2Ftagesschau_vor_20_jahren%2Fvideo-%s.html"; - // Private constructor to hide the implicit public one private TagesschauConstants() { // Utility class, do not instantiate diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java index 4aac38fa4..d37230a08 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java @@ -79,7 +79,12 @@ protected RecursiveTask> createCrawlerTask() { } catch (final ExecutionException ex) { LOG.fatal("Exception in Tagesschau crawler.", ex); } - return null; + return new RecursiveTask>() { + @Override + protected Set compute() { + return Set.of(); + } + }; } private Queue createArchiveUrl() { diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java index e9c512d07..a22eb67a5 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java @@ -54,7 +54,7 @@ public class TagesschauVideoDeserializer implements JsonDeserializer> .appendPattern("d MMMM uuuu") .toFormatter(Locale.GERMAN); private static final DateTimeFormatter DATE_TIME_FORMATTER = - DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ssZ", Locale.GERMANY); // 2016-10-29T16:15:00+02:00 + DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ssZ", Locale.GERMANY); private static final String GERMAN_TIME_ZONE = 
"Europe/Berlin"; private static final Logger LOG = LogManager.getLogger(TagesschauVideoDeserializer.class); private final AbstractCrawler crawler; @@ -75,7 +75,7 @@ private static Optional parseDate(final JsonObject metaObject, fi } return Optional.of(localDateTime); } catch (final DateTimeParseException ex) { - LOG.error("Error parsing date time value {}", dateValue.get(), ex); + LOG.warn("Error parsing date time value {}", dateValue.get(), ex); } } From 574cb1bc60fd283ad75a9b641dd5e16a71b95122 Mon Sep 17 00:00:00 2001 From: pidoubleyou <22942659+pidoubleyou@users.noreply.github.com> Date: Wed, 20 May 2026 21:30:40 +0200 Subject: [PATCH 05/10] fix sonar --- .../mediathekview/mserver/testhelper/WireMockTestBase.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java b/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java index e30d60878..4246d430c 100644 --- a/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java +++ b/src/test/java/de/mediathekview/mserver/testhelper/WireMockTestBase.java @@ -9,10 +9,7 @@ import org.junit.Before; import jakarta.annotation.Nullable; -import org.junit.BeforeClass; -import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import java.util.Optional; @@ -24,7 +21,7 @@ /** base class of tests with WireMock. 
*/ public abstract class WireMockTestBase { - private final Logger LOG = LogManager.getLogger(WireMockTestBase.class); + private static final Logger LOG = LogManager.getLogger(WireMockTestBase.class); protected WireMockServer wireMockServer = new WireMockServer(options().dynamicPort()); private boolean wireMockStarted = false; From 3cff5954a1ddb31f9e324ceeeca40d43a71ccb13 Mon Sep 17 00:00:00 2001 From: pidoubleyou <22942659+pidoubleyou@users.noreply.github.com> Date: Wed, 20 May 2026 21:36:23 +0200 Subject: [PATCH 06/10] fix sonar --- .../crawler/tagesschau/tasks/TagesschauEntriesTask.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTask.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTask.java index 9264c8428..9c895b446 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTask.java @@ -17,6 +17,7 @@ import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; +import org.jspecify.annotations.NonNull; public class TagesschauEntriesTask extends AbstractDocumentTask { private static final Logger LOG = LogManager.getLogger(TagesschauEntriesTask.class); @@ -43,7 +44,7 @@ protected void processDocument(CrawlerUrlDTO aUrlDTO, Document aDocument) { continue; } // normalize to absolute - final String fullUrl = href.startsWith("http") ? href : "https://www.tagesschau.de" + (href.startsWith("/") ? "" : "/") + href; + final String fullUrl = href.startsWith("http") ? 
href : buildUrl(href); if (Arrays.stream(BLACKLIST).noneMatch(fullUrl::equalsIgnoreCase)) { final Matcher matcherSubPage = PATTERN_SUB_PAGE.matcher(fullUrl); @@ -62,6 +63,10 @@ protected void processDocument(CrawlerUrlDTO aUrlDTO, Document aDocument) { taskResults.add(result); } + private static @NonNull String buildUrl(String href) { + return "https://www.tagesschau.de" + (href.startsWith("/") ? "" : "/") + href; + } + @Override protected AbstractRecursiveConverterTask createNewOwnInstance( Queue aElementsToProcess) { From 8489e9845d45cda19de9215c07c64cb6d6e4071c Mon Sep 17 00:00:00 2001 From: pidoubleyou <22942659+pidoubleyou@users.noreply.github.com> Date: Wed, 20 May 2026 21:49:33 +0200 Subject: [PATCH 07/10] filter videos --- .../mserver/crawler/tagesschau/TagesschauCrawler.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java index d37230a08..611ef243a 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java @@ -64,14 +64,15 @@ protected RecursiveTask> createCrawlerTask() { recursionCount++; } - getAndSetMaxCount(videos.size()); + final Queue videosFiltered = this.filterExistingFilms(videos, CrawlerUrlDTO::getUrl); + getAndSetMaxCount(videosFiltered.size()); printMessage( ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT, getSender().getName(), - videos.size()); + videosFiltered.size()); - return new TagesschauVideoTask(this, new ConcurrentLinkedQueue<>(videos)); + return new TagesschauVideoTask(this, new ConcurrentLinkedQueue<>(videosFiltered)); } catch (final InterruptedException ex) { LOG.fatal("Exception in Tagesschau crawler.", ex); From 849abe5f61b3e92a1494e5a53ea521aa070f7858 Mon Sep 17 00:00:00 2001 From: pidoubleyou 
<22942659+pidoubleyou@users.noreply.github.com> Date: Wed, 20 May 2026 21:50:05 +0200 Subject: [PATCH 08/10] fix sonar --- .../crawler/tagesschau/tasks/TagesschauEntriesTask.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTask.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTask.java index 9c895b446..71755f2d6 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTask.java +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/tasks/TagesschauEntriesTask.java @@ -22,8 +22,8 @@ public class TagesschauEntriesTask extends AbstractDocumentTask { private static final Logger LOG = LogManager.getLogger(TagesschauEntriesTask.class); - private static final Pattern PATTERN_VIDEO = Pattern.compile(".*/video-\\d+\\.html$"); - private static final Pattern PATTERN_SUB_PAGE = Pattern.compile(".*/(tsvorzwanzigjahren(?:-ts)?-?\\d+)\\.html$"); + private static final Pattern PATTERN_VIDEO = Pattern.compile("/video-\\d+\\.html$"); + private static final Pattern PATTERN_SUB_PAGE = Pattern.compile("/(tsvorzwanzigjahren(?:-ts)?-?\\d+)\\.html$"); private static final String[] BLACKLIST = new String[] {TagesschauConstants.ARCHIVE_START_URL}; public TagesschauEntriesTask(final AbstractCrawler crawler, final Queue queue) { From ebbeda9eee00077406191686159a44486ba47fab Mon Sep 17 00:00:00 2001 From: pidoubleyou <22942659+pidoubleyou@users.noreply.github.com> Date: Wed, 20 May 2026 21:51:12 +0200 Subject: [PATCH 09/10] change log level --- .../crawler/tagesschau/json/TagesschauVideoDeserializer.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java index a22eb67a5..b8599e1d1 100644 --- 
a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java @@ -149,7 +149,7 @@ private Optional parseDateFromTitle(String title) { try { return Optional.of(LocalDate.parse(datePart, GERMAN_LONG_NO_DOT)); } catch (DateTimeParseException ex) { - LOG.warn("no valid date converted", ex); + LOG.debug("no valid date converted", ex); } } return Optional.empty(); From f142393d9a06c870cce6d66fc59d30f997784a2f Mon Sep 17 00:00:00 2001 From: pidoubleyou <22942659+pidoubleyou@users.noreply.github.com> Date: Wed, 20 May 2026 21:51:56 +0200 Subject: [PATCH 10/10] fix warning --- .../mserver/crawler/tagesschau/TagesschauCrawler.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java index 611ef243a..9b7771ccf 100644 --- a/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java +++ b/src/main/java/de/mediathekview/mserver/crawler/tagesschau/TagesschauCrawler.java @@ -80,7 +80,7 @@ protected RecursiveTask> createCrawlerTask() { } catch (final ExecutionException ex) { LOG.fatal("Exception in Tagesschau crawler.", ex); } - return new RecursiveTask>() { + return new RecursiveTask<>() { @Override protected Set compute() { return Set.of();