Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion MServer-Config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ maximumRequestsPerSecond: 999.0

# If set, only these senders will be crawled; all others will be ignored.
senderIncluded:
- ARD
# - ARD
#- ARTE_DE
#- ARTE_FR
#- ARTE_PL
Expand All @@ -41,6 +41,7 @@ senderIncluded:
#- PHOENIX
#- SRF
#- SR
- TAGESSCHAU24
#- ZDF

#SRF,SR,PHOENIX,ORF,KIKA,DW,3SAT
Expand Down
6 changes: 6 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,12 @@
<version>${mockito.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<version>${mockito.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.hamcrest</groupId>
<artifactId>hamcrest</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package de.mediathekview.mserver.crawler;

import de.mediathekview.mserver.crawler.tagesschau.TagesschauCrawler;
import de.mediathekview.mserver.daten.Film;
import de.mediathekview.mserver.daten.Filmlist;
import de.mediathekview.mserver.daten.Sender;
Expand Down Expand Up @@ -547,6 +548,8 @@ private void initializeCrawler(final MServerConfigManager rootConfig) {
Sender.SRF, new SrfCrawler(forkJoinPool, messageListeners, progressListeners, rootConfig));
crawlerMap.put(
Sender.SR, new SrCrawler(forkJoinPool, messageListeners, progressListeners, rootConfig));
crawlerMap.put(
Sender.TAGESSCHAU24, new TagesschauCrawler(forkJoinPool, messageListeners, progressListeners, rootConfig));
crawlerMap.put(
Sender.ZDF, new ZdfCrawler(forkJoinPool, messageListeners, progressListeners, rootConfig));
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package de.mediathekview.mserver.crawler.tagesschau;

import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;

import java.util.HashSet;
import java.util.Set;

/**
 * Mutable holder for two groups of crawler URLs: URLs of videos and URLs of
 * sub pages.
 */
public class EntryUrlDto {
  private final Set<CrawlerUrlDTO> subPages = new HashSet<>();
  private final Set<CrawlerUrlDTO> videos = new HashSet<>();

  /** Creates a holder whose video and sub page sets are both empty. */
  public EntryUrlDto() {
    // nothing to do, the sets are created by the field initializers
  }

  /**
   * Registers a video URL.
   *
   * @param videoUrl the URL to add to the video set
   */
  public void addVideo(CrawlerUrlDTO videoUrl) {
    videos.add(videoUrl);
  }

  /**
   * Registers a sub page URL.
   *
   * @param subPageUrl the URL to add to the sub page set
   */
  public void addSubPage(CrawlerUrlDTO subPageUrl) {
    subPages.add(subPageUrl);
  }

  /**
   * @return the internal set of video URLs (the live set, not a copy)
   */
  public Set<CrawlerUrlDTO> getVideos() {
    return this.videos;
  }

  /**
   * @return the internal set of sub page URLs (the live set, not a copy)
   */
  public Set<CrawlerUrlDTO> getSubPages() {
    return this.subPages;
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package de.mediathekview.mserver.crawler.tagesschau;

/**
 * Shared constants of the Tagesschau crawler, which works on the
 * "vor 20 Jahren" (20 years ago) archive of daily news broadcasts.
 */
public final class TagesschauConstants {

  /** Not instantiable: this class only carries constants. */
  private TagesschauConstants() {
    // intentionally empty
  }

  /** Entry page of the "Tagesschau vor 20 Jahren" (20 years ago) archive. */
  public static final String ARCHIVE_START_URL =
      "https://www.tagesschau.de/inland/tsvorzwanzigjahren-ts-142.html";
}


Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package de.mediathekview.mserver.crawler.tagesschau;

import de.mediathekview.mserver.crawler.tagesschau.tasks.TagesschauEntriesTask;
import de.mediathekview.mserver.crawler.tagesschau.tasks.TagesschauVideoTask;
import de.mediathekview.mserver.daten.Film;
import de.mediathekview.mserver.daten.Sender;
import de.mediathekview.mserver.base.messages.listener.MessageListener;
import de.mediathekview.mserver.base.config.MServerConfigManager;
import de.mediathekview.mserver.base.messages.ServerMessages;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.crawler.basic.CrawlerUrlDTO;
import de.mediathekview.mserver.progress.listeners.SenderProgressListener;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.util.Collection;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.RecursiveTask;

/**
 * Crawler for the sender {@link Sender#TAGESSCHAU24}.
 *
 * <p>Starting at {@link TagesschauConstants#ARCHIVE_START_URL}, pages are processed round by
 * round with {@link TagesschauEntriesTask}: every round yields video URLs and further sub pages,
 * and the sub pages become the input of the next round. The collected video URLs are then handed
 * to a {@link TagesschauVideoTask}.
 */
public class TagesschauCrawler extends AbstractCrawler {

  private static final Logger LOG = LogManager.getLogger(TagesschauCrawler.class);

  /** Maximum number of page rounds when the topics search is enabled. */
  private static final int MAX_ROUNDS_FULL = 10;

  /** short run uses 2 recursion -> only the actual month is included */
  private static final int MAX_ROUNDS_SHORT = 2;

  public TagesschauCrawler(
      final ForkJoinPool aForkJoinPool,
      final Collection<MessageListener> aMessageListeners,
      final Collection<SenderProgressListener> aProgressListeners,
      final MServerConfigManager rootConfig) {
    super(aForkJoinPool, aMessageListeners, aProgressListeners, rootConfig);
  }

  @Override
  public Sender getSender() {
    return Sender.TAGESSCHAU24;
  }

  /**
   * Collects the video URLs of the archive, filters them with {@code filterExistingFilms} and
   * wraps the remaining ones into a {@link TagesschauVideoTask}.
   *
   * @return the task that loads the films, or a task yielding an empty set when collecting the
   *     URLs was interrupted or failed
   */
  @Override
  protected RecursiveTask<Set<Film>> createCrawlerTask() {
    try {
      final Set<CrawlerUrlDTO> videos = collectVideoUrls();

      final Queue<CrawlerUrlDTO> videosFiltered =
          this.filterExistingFilms(videos, CrawlerUrlDTO::getUrl);
      getAndSetMaxCount(videosFiltered.size());

      printMessage(
          ServerMessages.DEBUG_ALL_SENDUNG_FOLGEN_COUNT,
          getSender().getName(),
          videosFiltered.size());

      return new TagesschauVideoTask(this, new ConcurrentLinkedQueue<>(videosFiltered));

    } catch (final InterruptedException ex) {
      LOG.fatal("Exception in Tagesschau crawler.", ex);
      // restore the interrupt flag so callers further up can react to it
      Thread.currentThread().interrupt();
    } catch (final ExecutionException ex) {
      LOG.fatal("Exception in Tagesschau crawler.", ex);
    }
    return emptyTask();
  }

  /**
   * Walks the archive pages breadth-first and gathers all video URLs found on them.
   *
   * <p>The number of rounds is limited to {@link #MAX_ROUNDS_FULL} when the topics search is
   * enabled and to {@link #MAX_ROUNDS_SHORT} otherwise. Sub pages that were already part of an
   * earlier round are not queued again, so pages linking to each other are not downloaded
   * repeatedly. This relies on the equality of {@link CrawlerUrlDTO}, which the set based
   * de-duplication of the video URLs depends on as well.
   *
   * @return all video URLs found within the round limit
   * @throws InterruptedException if waiting for a round was interrupted
   * @throws ExecutionException if a round failed
   */
  private Set<CrawlerUrlDTO> collectVideoUrls() throws InterruptedException, ExecutionException {
    final Set<CrawlerUrlDTO> videos = new HashSet<>();
    final Set<CrawlerUrlDTO> visitedPages = new HashSet<>();
    Queue<CrawlerUrlDTO> inputQueue = createArchiveUrl();

    final int recursionMax =
        Boolean.TRUE.equals(crawlerConfig.getTopicsSearchEnabled())
            ? MAX_ROUNDS_FULL
            : MAX_ROUNDS_SHORT;

    for (int round = 0; !inputQueue.isEmpty() && round < recursionMax; round++) {
      LOG.debug("processing {} sub pages", inputQueue.size());
      // remember the pages before the queue is handed to the task
      visitedPages.addAll(inputQueue);

      final TagesschauEntriesTask entriesTask = new TagesschauEntriesTask(this, inputQueue);
      final Set<EntryUrlDto> results = this.forkJoinPool.submit(entriesTask).get();

      final Set<CrawlerUrlDTO> subPages = new HashSet<>();
      for (final EntryUrlDto result : results) {
        videos.addAll(result.getVideos());
        subPages.addAll(result.getSubPages());
      }
      // only pages not processed so far are worth another round
      subPages.removeAll(visitedPages);
      inputQueue = new ConcurrentLinkedQueue<>(subPages);
    }
    return videos;
  }

  /** Creates the initial queue holding only the archive start page. */
  private Queue<CrawlerUrlDTO> createArchiveUrl() {
    final Queue<CrawlerUrlDTO> urls = new ConcurrentLinkedQueue<>();
    urls.add(new CrawlerUrlDTO(TagesschauConstants.ARCHIVE_START_URL));
    return urls;
  }

  /** Fallback task that produces no films. */
  private static RecursiveTask<Set<Film>> emptyTask() {
    return new RecursiveTask<>() {
      @Override
      protected Set<Film> compute() {
        return Set.of();
      }
    };
  }
}


Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
package de.mediathekview.mserver.crawler.tagesschau.json;

import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import de.mediathekview.mserver.base.utils.JsonUtils;
import de.mediathekview.mserver.crawler.basic.AbstractCrawler;
import de.mediathekview.mserver.daten.*;
import java.lang.reflect.Type;
import java.net.MalformedURLException;
import java.net.URI;
import java.time.*;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeFormatterBuilder;
import java.time.format.DateTimeParseException;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

public class TagesschauVideoDeserializer implements JsonDeserializer<List<Film>> {
private static final String ELEMENT_MC = "mc";
private static final String ELEMENT_MEDIA = "media";
private static final String ELEMENT_META = "meta";
private static final String ELEMENT_PLUG_IN_DATA = "pluginData";
private static final String ELEMENT_SHARING_WEB = "sharing@web";
private static final String ELEMENT_STREAMS = "streams";

private static final String ATTRIBUTE_DATE = "broadcastedOnDateTime";
private static final String ATTRIBUTE_DURATION = "durationSeconds";
private static final String ATTRIBUTE_TOPIC = "seriesTitle";
private static final String ATTRIBUTE_TITLE = "title";

private static final String ATTRIBUTE_WIDTH = "maxHResolutionPx";
private static final String ATTRIBUTE_MIMETYPE = "mimeType";
private static final String ATTRIBUTE_URL = "url";
private static final String ATTRIBUTE_LINK = "link";
private static final String[] SUPPORTED_MIME_TYPES = new String[] { "video/mp4" };

private static final Pattern LONG_MONTH_PATTERN =
Pattern.compile("(\\d{1,2}(?:\\.|\\s)\\s*[A-Za-zÄÖÜäöüß]+\\s+\\d{4})");

Check warning on line 43 in src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Replace this alternation with a character class.

See more on https://sonarcloud.io/project/issues?id=mediathekview_MServer&issues=AZ5G0fjlx9JjBUJrD3Rh&open=AZ5G0fjlx9JjBUJrD3Rh&pullRequest=1146
private static final DateTimeFormatter GERMAN_LONG = new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.appendPattern("d. MMMM uuuu")
.toFormatter(Locale.GERMAN);
private static final DateTimeFormatter GERMAN_LONG_NO_SPACE = new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.appendPattern("d.MMMM uuuu")
.toFormatter(Locale.GERMAN);
private static final DateTimeFormatter GERMAN_LONG_NO_DOT = new DateTimeFormatterBuilder()
.parseCaseInsensitive()
.appendPattern("d MMMM uuuu")
.toFormatter(Locale.GERMAN);
private static final DateTimeFormatter DATE_TIME_FORMATTER =
DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ssZ", Locale.GERMANY);
private static final String GERMAN_TIME_ZONE = "Europe/Berlin";
private static final Logger LOG = LogManager.getLogger(TagesschauVideoDeserializer.class);
private final AbstractCrawler crawler;

public TagesschauVideoDeserializer(AbstractCrawler crawler) {
this.crawler = crawler;
}

private static Optional<LocalDateTime> parseDate(final JsonObject metaObject, final Optional<LocalDate> titleDate) {
final Optional<String> dateValue =
JsonUtils.getAttributeAsString(metaObject, ATTRIBUTE_DATE);
if (dateValue.isPresent()) {
try {
final OffsetDateTime inputDateTime = OffsetDateTime.parse(dateValue.get(), DATE_TIME_FORMATTER);
LocalDateTime localDateTime = inputDateTime.atZoneSameInstant(ZoneId.of(GERMAN_TIME_ZONE)).toLocalDateTime();
if (titleDate.isPresent() && titleDate.get().getYear() != localDateTime.getYear()) {
localDateTime = localDateTime.withYear(titleDate.get().getYear());
}
return Optional.of(localDateTime);
} catch (final DateTimeParseException ex) {
LOG.warn("Error parsing date time value {}", dateValue.get(), ex);
}
}

return titleDate.map(localDate -> LocalDateTime.of(localDate, LocalTime.of(20, 0)));
}

@Override
public List<Film> deserialize(

Check failure on line 86 in src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Refactor this method to reduce its Cognitive Complexity from 17 to the 15 allowed.

See more on https://sonarcloud.io/project/issues?id=mediathekview_MServer&issues=AZ5G0fjmx9JjBUJrD3Ri&open=AZ5G0fjmx9JjBUJrD3Ri&pullRequest=1146
JsonElement jsonElement, Type type, JsonDeserializationContext jsonDeserializationContext) {

final List<Film> results = new ArrayList<>();

Optional<JsonElement> mcElement = JsonUtils.getElement(jsonElement, ELEMENT_MC);
if (mcElement.isPresent()) {
final Optional<JsonElement> metaElement = JsonUtils.getElement(mcElement.get(), ELEMENT_META);
if (metaElement.isPresent()) {
final JsonObject metaObject = metaElement.get().getAsJsonObject();
final Optional<String> topic = JsonUtils.getAttributeAsString(metaObject, ATTRIBUTE_TOPIC);
final Optional<String> title = JsonUtils.getAttributeAsString(metaObject, ATTRIBUTE_TITLE);
final Optional<Integer> duration = JsonUtils.getAttributeAsInt(metaObject, ATTRIBUTE_DURATION);
final Optional<LocalDateTime> date = parseDate(metaObject, parseDateFromTitle(title.orElse("")));
final Map<Resolution, String> urls = parseUrls(mcElement.get().getAsJsonObject());
final String website = parseWebsite(mcElement.get().getAsJsonObject());

final Film film =
new Film(
UUID.randomUUID(),
Sender.TAGESSCHAU24,
title.orElse(""),
topic.orElse(""),
date.orElse(LocalDateTime.now()),
duration.isEmpty() ? Duration.ofSeconds(0) : Duration.ofSeconds(duration.get()));
film.addGeolocation(GeoLocations.GEO_NONE);
if (!website.isEmpty()) {
try {
film.setWebsite(URI.create(website).toURL());
} catch (MalformedURLException e) {
LOG.error("Invalid website URL: {}", website, e);
}
}

urls.forEach((resolution, url) -> {
try {
film.addUrl(resolution, new FilmUrl(url, crawler.determineFileSizeInKB(url)));
} catch (final MalformedURLException ex) {
LOG.error("InvalidUrl: {}", url, ex);
}
});

results.add(film);
}
}

return results;
}

private Optional<LocalDate> parseDateFromTitle(String title) {
Matcher m = LONG_MONTH_PATTERN.matcher(title);
if (m.find()) {
String datePart = m.group(1).replaceAll("\\s+", " ").trim();
try {
return Optional.of(LocalDate.parse(datePart, GERMAN_LONG));
} catch (DateTimeParseException ignored) {
// try other conversion
}
try {
return Optional.of(LocalDate.parse(datePart, GERMAN_LONG_NO_SPACE));
} catch (DateTimeParseException ignored) {
// try other conversion
}
try {
return Optional.of(LocalDate.parse(datePart, GERMAN_LONG_NO_DOT));
} catch (DateTimeParseException ex) {
LOG.debug("no valid date converted", ex);
}
}
return Optional.empty();
}

private String parseWebsite(JsonObject mcObject) {
return JsonUtils.getElementValueAsString(mcObject, ELEMENT_PLUG_IN_DATA, ELEMENT_SHARING_WEB, ATTRIBUTE_LINK).orElse("");
}

private Map<Resolution, String> parseUrls(final JsonObject mcObject) {

Check failure on line 162 in src/main/java/de/mediathekview/mserver/crawler/tagesschau/json/TagesschauVideoDeserializer.java

View check run for this annotation

SonarQubeCloud / SonarCloud Code Analysis

Refactor this method to reduce its Cognitive Complexity from 19 to the 15 allowed.

See more on https://sonarcloud.io/project/issues?id=mediathekview_MServer&issues=AZ5G0fjmx9JjBUJrD3Rj&open=AZ5G0fjmx9JjBUJrD3Rj&pullRequest=1146
final Map<Resolution, String> urls = new EnumMap<>(Resolution.class);
if (mcObject.has(ELEMENT_STREAMS) && mcObject.get(ELEMENT_STREAMS).isJsonArray()) {
mcObject
.get(ELEMENT_STREAMS)
.getAsJsonArray()
.forEach(
stream -> {
final JsonObject streamObject = stream.getAsJsonObject();
if (streamObject.has(ELEMENT_MEDIA)
&& streamObject.get(ELEMENT_MEDIA).isJsonArray()) {
streamObject
.get(ELEMENT_MEDIA)
.getAsJsonArray()
.forEach(
media -> {
final Optional<String> mimeType =
JsonUtils.getElementValueAsString(media, ATTRIBUTE_MIMETYPE);
if (mimeType.isPresent()
&& Arrays.stream(SUPPORTED_MIME_TYPES)
.anyMatch(type -> type.equals(mimeType.get()))) {
final Optional<Integer> width =
JsonUtils.getAttributeAsInt(
media.getAsJsonObject(), ATTRIBUTE_WIDTH);
final Optional<String> url =
JsonUtils.getElementValueAsString(media, ATTRIBUTE_URL);

if (width.isPresent() && url.isPresent()) {
final Resolution resolution =
Resolution.getResolutionFromWidth(width.get());
urls.put(resolution, url.get());
}
}
});
}
});
}
return urls;
}
}
Loading
Loading