From fec3e0b002377861363f6d1c32ef0d314ceb0d4d Mon Sep 17 00:00:00 2001 From: kalle Date: Mon, 27 Feb 2023 19:58:22 +0100 Subject: [PATCH 1/8] Started on documentation, added some helper scripts, made things more configurable, etc --- README.md | 51 ++++++++ register-wiki.sh | 9 ++ .../wikispeech/prerender/LocalCache.java | 1 - .../prerender/WebAppConfiguration.java | 16 ++- .../prerender/mediawiki/WikispeechApi.java | 4 +- .../prerender/rest/MainController.java | 56 ++------- .../prerender/service/CreateWikiCommand.java | 110 ++++++++++++++++++ .../service/MainPageLinksPrioritizer.java | 3 - .../service/RecentChangesService.java | 36 +++++- 9 files changed, 231 insertions(+), 55 deletions(-) create mode 100644 README.md create mode 100755 register-wiki.sh create mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/CreateWikiCommand.java diff --git a/README.md b/README.md new file mode 100644 index 0000000..dc1b103 --- /dev/null +++ b/README.md @@ -0,0 +1,51 @@ +# Wikispeech-Prerender + +Starting from scratch will + +## REST + +Most REST calls exist for debugging and development reasons. +As a user of this service, all you need is ```POST /api/wiki``` + +### POST /api/wiki +* consumerUrl: Wiki to be monitored for pre-rendering. +* initialLastRecentChangesLimitInMinutes: (default: 60) Number of minutes of initial recent changes backlog to be processed. +* mainPagePriority: (default: 10) Base priority of segments on Wiki main page. +* maximumSynthesizedVoiceAgeInDays: (default: 30) Number of days before attempting to re-synthesize segments on this Wiki. + +If initialLastRecentChangesLimitInMinutes is set to 0, then only new recent changes will be processed. 
+ +Example: ```POST http://host:port/api/wiki?consumerUrl=https://sv.wikipedia.org/w``` + +### GET /api/synthesis/queue/candidates +* limit: (default 100) Maximum number of results +* startOffset: (default 0) Start offset for pagination + +Queue of Wiki page segments in line to be synthesized using a specific language and voice. + +### DELETE /api/synthesis/queue + +Clears queue of Wiki page segments in line to be synthesized. + +### GET /api/synthesis/errors +* limit: (default 100) Maximum number of results +* startOffset: (default 0) Start offset for pagination + +A list of errors that have occurred during synthesis of Wiki page segments. + +### GET /api/page +* consumerUrl: Wiki +* title: Wiki page title + +Example: ```GET http://host:port/api/page?consumerUrl=https://sv.wikipedia.org/w&title=Portal:Huvudsida``` + +Displays status and statistics about a given Wiki page. +* Priority +* Language +* Revision at segmentation +* Timestamp segmented +* Segments +* Voices synthesized +* Timestamp synthesized +* Synthesized revision +* etc \ No newline at end of file diff --git a/register-wiki.sh b/register-wiki.sh new file mode 100755 index 0000000..9b197b0 --- /dev/null +++ b/register-wiki.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +if [ $# -eq 0 ]; then + consumerUrl="https://sv.wikipedia.org/w" +else + consumerUrl=$1 +fi + +curl -d "consumerUrl=${consumerUrl}&initialLastRecentChangesLimitInMinutes=0&mainPagePriority=10&maximumSynthesizedVoiceAgeInDays=30" -X POST http://localhost:8080/api/wiki diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/LocalCache.java b/src/main/java/se/wikimedia/wikispeech/prerender/LocalCache.java index 3ad31f5..6adad93 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/LocalCache.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/LocalCache.java @@ -1,6 +1,5 @@ package se.wikimedia.wikispeech.prerender; -import java.util.HashMap; import java.util.LinkedHashMap; import java.util.Map; diff --git 
a/src/main/java/se/wikimedia/wikispeech/prerender/WebAppConfiguration.java b/src/main/java/se/wikimedia/wikispeech/prerender/WebAppConfiguration.java index c9981cb..a780ee5 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/WebAppConfiguration.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/WebAppConfiguration.java @@ -11,14 +11,18 @@ import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.ComponentScan; import org.springframework.context.annotation.Configuration; +import org.springframework.scheduling.annotation.SchedulingConfigurer; +import org.springframework.scheduling.config.ScheduledTaskRegistrar; import java.io.IOException; +import java.util.concurrent.Executor; +import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; @Configuration @EnableAutoConfiguration(exclude = {DataSourceAutoConfiguration.class}) @ComponentScan(basePackages = "se.wikimedia.wikispeech.prerender") -public class WebAppConfiguration { +public class WebAppConfiguration implements SchedulingConfigurer { @Bean public OkHttpClient okHttpClient() { @@ -42,4 +46,14 @@ public Response intercept(@NotNull Chain chain) throws IOException { .build(); } + @Bean + public Executor taskExecutor() { + return Executors.newScheduledThreadPool(10); + } + + @Override + public void configureTasks(ScheduledTaskRegistrar taskRegistrar) { + taskRegistrar.setScheduler(taskExecutor()); + } + } diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java b/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java index 71d8e57..9eb2a15 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java @@ -88,14 +88,14 @@ public Long getCurrentRevision(String consumerUrl, String title) throws IOExcept public static class MWException extends IOException { @Getter - private JsonNode 
error; + private final JsonNode error; public MWException(JsonNode error) { this.error = error; } public String getExceptionClass() { - return error.get("error").get("errorclass").textValue(); + return error.toString(); } } diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java b/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java index 006b220..30d4d91 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java @@ -14,6 +14,7 @@ import se.wikimedia.wikispeech.prerender.LocalCache; import se.wikimedia.wikispeech.prerender.mediawiki.PageApi; import se.wikimedia.wikispeech.prerender.mediawiki.WikispeechApi; +import se.wikimedia.wikispeech.prerender.service.CreateWikiCommand; import se.wikimedia.wikispeech.prerender.service.PriorityService; import se.wikimedia.wikispeech.prerender.service.SegmentService; import se.wikimedia.wikispeech.prerender.service.SynthesizeService; @@ -23,10 +24,7 @@ import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegment; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegmentVoice; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; -import se.wikimedia.wikispeech.prerender.service.prevalence.query.GetWiki; import se.wikimedia.wikispeech.prerender.service.prevalence.query.PageSegmentVoiceReference; -import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.*; -import se.wikimedia.wikispeech.prerender.site.WikiResolver; import java.io.IOException; import java.time.Duration; @@ -68,44 +66,18 @@ public MainController( produces = "application/json" ) public ResponseEntity createWiki( - @RequestParam String consumerUrl + @RequestParam String consumerUrl, + @RequestParam(defaultValue = "60") Integer initialLastRecentChangesLimitInMinutes, + @RequestParam(defaultValue = "10") Float mainPagePriority, 
+ @RequestParam(defaultValue = "30") Integer maximumSynthesizedVoiceAgeInDays ) throws Exception { - // todo assert correct consumerUrl. - Wiki wiki = prevalence.execute(new GetWiki(consumerUrl)); - if (wiki != null) { - return ResponseEntity.badRequest().body("{\"error\", \"Already exists\"}"); - } - WikiResolver resolver = new WikiResolver(); - resolver.detect(consumerUrl); - - PageApi.PageInfo pageInfo = pageApi.getPageInfo(consumerUrl, resolver.getMainPageTitle()); - - CreateWiki createWiki = new CreateWiki(); - createWiki.setConsumerUrl(consumerUrl); - createWiki.setName(resolver.getWikiName()); - createWiki.setDefaultLanguage(pageInfo.getPageLanguage()); - Set namespaces = new LinkedHashSet<>(); - namespaces.add(0); - namespaces.add(pageInfo.getNamespaceIdentity()); - createWiki.setPollRecentChangesNamespaces(new ArrayList<>(namespaces)); - createWiki.setTimestampOfLastRecentChangesItemProcessed(OffsetDateTime.now().minusDays(7)); - // todo request setting - createWiki.setMaximumSynthesizedVoiceAge(Duration.ofDays(30)); - // todo request setting - createWiki.setVoicesPerLanguage(resolver.getDefaultVoicesByLanguage()); - wiki = prevalence.execute(createWiki); - - CreateNonSegmentedPage createMainPage = new CreateNonSegmentedPage(); - createMainPage.setConsumerUrl(wiki.getConsumerUrl()); - createMainPage.setTitle(resolver.getMainPageTitle()); - createMainPage.setLanguageAtSegmentation(pageInfo.getPageLanguage()); - createMainPage.setRevisionAtSegmentation(pageInfo.getLastRevisionIdentity()); - createMainPage.setPriority(10F); - Page mainPage = prevalence.execute(createMainPage); - - prevalence.execute(new SetWikiMainPage(consumerUrl, mainPage.getTitle())); - segmentService.segment(consumerUrl, createMainPage.getTitle()); + Wiki wiki = new CreateWikiCommand(prevalence, segmentService, pageApi, consumerUrl) + .setVoicesPerLanguage(null) + .setMainPagePriority(mainPagePriority) + 
.setTimestampOfLastRecentChangesItemProcessed(OffsetDateTime.now().minusMinutes(initialLastRecentChangesLimitInMinutes)) + .setMaximumSynthesizedVoiceAge(Duration.ofDays(maximumSynthesizedVoiceAgeInDays)) + .execute(); return ResponseEntity.ok(objectMapper.writeValueAsString(wiki)); } @@ -149,12 +121,6 @@ private WikispeechApi.Segment getSegment(String consumerUrl, String title, byte[ return null; } - @RequestMapping( - method = RequestMethod.GET, - path = "synthesis", - produces = "application/json" - ) - @Data public static class SynthesisErrorsResponse { private int totalHits; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/CreateWikiCommand.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/CreateWikiCommand.java new file mode 100644 index 0000000..cdd8c39 --- /dev/null +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/CreateWikiCommand.java @@ -0,0 +1,110 @@ +package se.wikimedia.wikispeech.prerender.service; + +import se.wikimedia.wikispeech.prerender.mediawiki.PageApi; +import se.wikimedia.wikispeech.prerender.service.prevalence.Prevalence; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; +import se.wikimedia.wikispeech.prerender.service.prevalence.query.GetWiki; +import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.CreateNonSegmentedPage; +import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.CreateWiki; +import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.SetWikiMainPage; +import se.wikimedia.wikispeech.prerender.site.WikiResolver; + +import java.time.Duration; +import java.time.OffsetDateTime; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.Set; + +public class CreateWikiCommand { + + private final Prevalence prevalence; + private final PageApi pageApi; + private final SegmentService 
segmentService; + + private final String consumerUrl; + + private OffsetDateTime timestampOfLastRecentChangesItemProcessed = OffsetDateTime.now(); + + private Duration maximumSynthesizedVoiceAge = Duration.ofDays(30); + + private Map> voicesPerLanguage = null; + + private Float mainPagePriority = 10F; + + public CreateWikiCommand( + Prevalence prevalence, + SegmentService segmentService, + PageApi pageApi, + + String consumerUrl + ) { + this.prevalence = prevalence; + this.pageApi = pageApi; + this.segmentService = segmentService; + + this.consumerUrl = consumerUrl; + } + + public Wiki execute() throws Exception { + Wiki wiki = prevalence.execute(new GetWiki(consumerUrl)); + if (wiki != null) + throw new IllegalStateException("Wiki with consumer URL '"+consumerUrl+"' already exists"); + WikiResolver resolver = new WikiResolver(); + resolver.detect(consumerUrl); + + PageApi.PageInfo pageInfo = pageApi.getPageInfo(consumerUrl, resolver.getMainPageTitle()); + + CreateWiki createWiki = new CreateWiki(); + createWiki.setConsumerUrl(consumerUrl); + createWiki.setName(resolver.getWikiName()); + createWiki.setDefaultLanguage(pageInfo.getPageLanguage()); + Set namespaces = new LinkedHashSet<>(); + namespaces.add(0); + namespaces.add(pageInfo.getNamespaceIdentity()); + createWiki.setPollRecentChangesNamespaces(new ArrayList<>(namespaces)); + createWiki.setTimestampOfLastRecentChangesItemProcessed(timestampOfLastRecentChangesItemProcessed); + createWiki.setMaximumSynthesizedVoiceAge(maximumSynthesizedVoiceAge); + if (voicesPerLanguage == null) + createWiki.setVoicesPerLanguage(resolver.getDefaultVoicesByLanguage()); + else + createWiki.setVoicesPerLanguage(voicesPerLanguage); + + wiki = prevalence.execute(createWiki); + + CreateNonSegmentedPage createMainPage = new CreateNonSegmentedPage(); + createMainPage.setConsumerUrl(wiki.getConsumerUrl()); + createMainPage.setTitle(resolver.getMainPageTitle()); + createMainPage.setLanguageAtSegmentation(pageInfo.getPageLanguage()); + 
createMainPage.setRevisionAtSegmentation(pageInfo.getLastRevisionIdentity()); + createMainPage.setPriority(mainPagePriority); + Page mainPage = prevalence.execute(createMainPage); + + prevalence.execute(new SetWikiMainPage(consumerUrl, mainPage.getTitle())); + + segmentService.segment(consumerUrl, createMainPage.getTitle()); + + return wiki; + } + + public CreateWikiCommand setTimestampOfLastRecentChangesItemProcessed(OffsetDateTime timestampOfLastRecentChangesItemProcessed) { + this.timestampOfLastRecentChangesItemProcessed = timestampOfLastRecentChangesItemProcessed; + return this; + } + + public CreateWikiCommand setMaximumSynthesizedVoiceAge(Duration maximumSynthesizedVoiceAge) { + this.maximumSynthesizedVoiceAge = maximumSynthesizedVoiceAge; + return this; + } + + public CreateWikiCommand setVoicesPerLanguage(Map> voicesPerLanguage) { + this.voicesPerLanguage = voicesPerLanguage; + return this; + } + + public CreateWikiCommand setMainPagePriority(Float mainPagePriority) { + this.mainPagePriority = mainPagePriority; + return this; + } +} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java index 60d0c5e..86d63db 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java @@ -1,7 +1,6 @@ package se.wikimedia.wikispeech.prerender.service; import lombok.Data; -import okhttp3.OkHttpClient; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.prevayler.Query; @@ -13,7 +12,6 @@ import se.wikimedia.wikispeech.prerender.mediawiki.PageApi; import se.wikimedia.wikispeech.prerender.service.prevalence.Prevalence; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; -import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; import 
se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; import se.wikimedia.wikispeech.prerender.site.ScrapePageForWikiLinks; @@ -98,5 +96,4 @@ public ConsumerUrlAndTitle(String consumerUrl, String title) { this.title = title; } } - } diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/RecentChangesService.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/RecentChangesService.java index 8be2bee..b6e4587 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/RecentChangesService.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/RecentChangesService.java @@ -1,5 +1,6 @@ package se.wikimedia.wikispeech.prerender.service; +import lombok.Data; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.prevayler.Query; @@ -29,17 +30,22 @@ public class RecentChangesService extends ExecutorService implements SmartLifecy private final Prevalence prevalence; private final SegmentService segmentService; private final RecentChangesApi recentChangesApi; + private final PriorityService priorityService; private final Map lastProcessedRecentChangesItemByConsumerUrl; + @Autowired public RecentChangesService( - @Autowired Prevalence prevalence, - @Autowired SegmentService segmentService, - @Autowired RecentChangesApi recentChangesApi + Prevalence prevalence, + SegmentService segmentService, + RecentChangesApi recentChangesApi, + PriorityService priorityService + ) { this.prevalence = prevalence; this.segmentService = segmentService; this.recentChangesApi = recentChangesApi; + this.priorityService = priorityService; this.lastProcessedRecentChangesItemByConsumerUrl = new ConcurrentHashMap<>(); } @@ -115,6 +121,9 @@ private boolean poll(Wiki wiki) throws Exception { RecentChangesApi.Item lastProcessedRecentChangesItemAtStart = lastProcessedRecentChangesItemByConsumerUrl.get(wiki.getConsumerUrl()); + LocalDateTime now = LocalDateTime.now(); + LocalDateTime future = 
now.plus(Duration.ofDays(1)); + recentChangesApi.get( wiki.getConsumerUrl(), wiki.getPollRecentChangesNamespaces(), @@ -146,6 +155,16 @@ public boolean collect(RecentChangesApi.Item recentlyChanged) { recentlyChanged.getTitle() ) ) { log.debug("Queued command to segment based on {} at {}", recentlyChanged, wiki.getName()); + + // todo: If change was made by bot, then set low multiplier. Perhaps 0.5f? +// priorityService.put( +// new ConsumerUrlAndTitle(wiki.getConsumerUrl(), recentlyChanged.getTitle()), +// new PriorityService.PagePrioritySetting( +// now, future, 0.5f, +// wiki.getConsumerUrl(), recentlyChanged.getTitle() +// ) +// ); + } else { log.trace("The queue already contains a command to segment based on {} at {}", recentlyChanged, wiki.getName()); } @@ -176,5 +195,16 @@ public boolean collect(RecentChangesApi.Item recentlyChanged) { } } + @Data + private static class ConsumerUrlAndTitle { + private String consumerUrl; + private String title; + + public ConsumerUrlAndTitle(String consumerUrl, String title) { + this.consumerUrl = consumerUrl; + this.title = title; + } + } + } From b36ad78eb63126bc60d038ec3b1da31cd6d303e4 Mon Sep 17 00:00:00 2001 From: kalle Date: Tue, 14 Mar 2023 10:58:07 +0100 Subject: [PATCH 2/8] Better documentation Attempts to limit use of RAM so that heap doesn't grow greater than 1GB. 
Minor bugfixes --- README.md | 78 +++++++++- register-wiki.sh | 2 +- run.sh | 4 +- .../prerender/WebAppConfiguration.java | 2 + .../service/MainPageLinksPrioritizer.java | 8 +- .../prerender/service/PageCleanupService.java | 41 +++++ .../prerender/service/SegmentService.java | 4 + .../prerender/service/SynthesizeService.java | 145 ++++++++++-------- .../service/prevalence/Prevalence.java | 28 ++++ .../prevalence/domain/state/PageSegment.java | 1 + .../prevalence/transaction/FlushPages.java | 60 ++++++++ .../transaction/FlushSegmentPageVoice.java | 50 ++++++ 12 files changed, 353 insertions(+), 70 deletions(-) create mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/PageCleanupService.java create mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPages.java create mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushSegmentPageVoice.java diff --git a/README.md b/README.md index dc1b103..500750f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,82 @@ # Wikispeech-Prerender -Starting from scratch will +## Installing and running + +### Requirements + +``` +apt-get install maven openjdk-11-jdk +``` + +This service keeps the state in RAM using [system prevalence pattern](https://en.wikipedia.org/wiki/System_prevalence) (rather than using a database). +The service has been coded in a way that it hopefully shouldn't grow the heap larger than 1GB, +but there is no guarantee that this limit won't be exceeded. If you have a lot of RAM on the machine, +consider increasing the -Xmx value in ```run.sh```. + +### For the first time +``` +./run.sh (start service on port 9090) +./register-wiki.sh [consumer url, defaults to svwp] +``` + + +### When installed +``` +./run.sh +``` + +### Clear state and start from scratch + +The system state is store in directory ```prevalence```. 
To start from scratch, +simply stop the service, delete the directory ```prevalence``` and start the service again. +You will at this point once again have to register the wikis you want to pre-render. + +``` +rm -rf prevalence +./run.sh +./register-wiki.sh +``` + + +## What this service does + +* It finds pages to segment and synthesize by + * Polling main page metadata once every minute to detect updates. + (This could be improved by listening to recent changes, but that requires consideration). + * Harvesting wiki links from main page. + * Polling for updated pages from recent changes. + +All you need to do is to register the "consumer URL" of a wiki (e.g. ```https://sv.wikipedia.org/w```), and this service will figure everything else out: languages, voices, etc. + +The selected order to synthesize segments is evaluated from priority settings: + +* The further down a segment occurs on a page (the greater the segment index), + the less priority the segment receives. This is a minuscule change of priority. +* Pages linked from main page get a multiplication factor of 5 to all segments. +* The main page gets a multiplication factor of 10 on all segments. + +Basically this means the following order when synthesizing: +1. All segments in the main page. +2. The first segment in pages linked from the current main page. +3. The second segment in pages linked from the current main page. +4. ... until all segments in all pages linked from the current main page are synthesized. +5. The first segment in pages found in recent changes. +6. The second segment in pages found in recent changes. +7. ... until all segments in all pages found in recent changes have been synthesized. + +As the number of candidates to be synthesized can grow very large in a rather short time, +a flushing mechanism kicks in when there are more than 100,000 candidates in the queue, +removing those with the lowest priority and retaining the top 100,000. 
+ +## TODO + +* Add feature in Wikispeech to not send audio response on synthesis, in order to minimize network data. +* Flush out pages that have not been updated for x days +* Make the 100000 value in a property file or something (hard coded in SynthesizeService.java) +* Make default priorities configurable in properties file (hard coded 5F in MainPagePrioritizer.java) +* Report state of candidate, flushing, etc to influxdb. + + ## REST diff --git a/register-wiki.sh b/register-wiki.sh index 9b197b0..be5406a 100755 --- a/register-wiki.sh +++ b/register-wiki.sh @@ -6,4 +6,4 @@ else consumerUrl=$1 fi -curl -d "consumerUrl=${consumerUrl}&initialLastRecentChangesLimitInMinutes=0&mainPagePriority=10&maximumSynthesizedVoiceAgeInDays=30" -X POST http://localhost:8080/api/wiki +curl -d "consumerUrl=${consumerUrl}&initialLastRecentChangesLimitInMinutes=0&mainPagePriority=10&maximumSynthesizedVoiceAgeInDays=30" -X POST http://localhost:9090/api/wiki diff --git a/run.sh b/run.sh index 2365053..f501588 100755 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ #!/bin/bash -export MAVEN_OPTS="-Xmx2g" +export MAVEN_OPTS="-Xmx1g" mvn clean install -mvn exec:java -Dinflux.username="" -Dinflux.password="" -Dexec.mainClass="se.wikimedia.wikispeech.prerender.WebApp" +mvn exec:java -Dinflux.username="" -Dinflux.password="" -Dexec.mainClass="se.wikimedia.wikispeech.prerender.WebApp" -Dserver.port="9090" diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/WebAppConfiguration.java b/src/main/java/se/wikimedia/wikispeech/prerender/WebAppConfiguration.java index a780ee5..616faf0 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/WebAppConfiguration.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/WebAppConfiguration.java @@ -11,6 +11,7 @@ import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.ComponentScan; import org.springframework.context.annotation.Configuration; +import 
org.springframework.scheduling.annotation.EnableScheduling; import org.springframework.scheduling.annotation.SchedulingConfigurer; import org.springframework.scheduling.config.ScheduledTaskRegistrar; @@ -20,6 +21,7 @@ import java.util.concurrent.TimeUnit; @Configuration +@EnableScheduling @EnableAutoConfiguration(exclude = {DataSourceAutoConfiguration.class}) @ComponentScan(basePackages = "se.wikimedia.wikispeech.prerender") public class WebAppConfiguration implements SchedulingConfigurer { diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java index 86d63db..2bc4d95 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java @@ -47,13 +47,15 @@ public MainPageLinksPrioritizer( private final Map lastChangedByWikiConsumerUrl = new HashMap<>(); + private boolean initialRun = false; + @Scheduled(fixedDelay = 1, timeUnit = TimeUnit.MINUTES, initialDelay = 0) public void run() throws Exception { LocalDateTime now = LocalDateTime.now(); LocalDateTime future = now.plus(Duration.ofDays(1)); for (Wiki wiki : prevalence.execute(new Query>() { @Override - public Set query(Root root, Date date) throws Exception { + public Set query(Root root, Date date) { Set wikis = new HashSet<>(root.getWikiByConsumerUrl().values()); wikis.removeIf( w -> w.getMainPage() == null); return wikis; @@ -61,7 +63,9 @@ public Set query(Root root, Date date) throws Exception { })) { OffsetDateTime lastChanged = OffsetDateTime.parse(pageApi.getHttpHeaders(wiki.getConsumerUrl(), wiki.getMainPage().getTitle()).get("Last-Modified"), DateTimeFormatter.RFC_1123_DATE_TIME); OffsetDateTime previousLastChanged = lastChangedByWikiConsumerUrl.put(wiki.getConsumerUrl(), lastChanged); - if (!lastChanged.equals(previousLastChanged)) { + if (initialRun || 
!lastChanged.equals(previousLastChanged)) { + // priority service is not persistent, we need to reapply priority from main page on restart. + initialRun = false; log.info("Setting priority for links in {} of {}", wiki.getMainPage().getTitle(), wiki.getName()); ScrapePageForWikiLinks scraper = new ScrapePageForWikiLinks(); scraper.setConsumerUrl(wiki.getConsumerUrl()); diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/PageCleanupService.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/PageCleanupService.java new file mode 100644 index 0000000..b2147e7 --- /dev/null +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/PageCleanupService.java @@ -0,0 +1,41 @@ +package se.wikimedia.wikispeech.prerender.service; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.prevayler.Query; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.scheduling.annotation.Scheduled; +import org.springframework.stereotype.Service; +import se.wikimedia.wikispeech.prerender.service.prevalence.Prevalence; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; + +import java.time.Duration; +import java.time.LocalDateTime; +import java.util.Date; +import java.util.HashMap; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; + +@Service +public class PageCleanupService { + + private final Logger log = LogManager.getLogger(getClass()); + + private final Prevalence prevalence; + + private final Duration maximumPageChangeAge = Duration.ofDays(7); + + @Autowired + public PageCleanupService(Prevalence prevalence) { + this.prevalence = prevalence; + } + + @Scheduled(initialDelay = 10, fixedDelay = 60 * 24, timeUnit = TimeUnit.MINUTES) + public void cleanup() throws 
Exception { + + } + +} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/SegmentService.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/SegmentService.java index 2411561..1d4b9a0 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/SegmentService.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/SegmentService.java @@ -190,4 +190,8 @@ public boolean collect(WikispeechApi.Segment segment) { } + public void flushSegments() { + // todo delete old segments that has not been touched for a very long time. + } + } diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java index 7ea97cf..acb2573 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java @@ -8,16 +8,20 @@ import org.prevayler.Query; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.context.SmartLifecycle; +import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; import se.wikimedia.wikispeech.prerender.mediawiki.WikispeechApi; import se.wikimedia.wikispeech.prerender.service.prevalence.Prevalence; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.command.SegmentPage; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegment; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegmentVoice; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; +import se.wikimedia.wikispeech.prerender.service.prevalence.query.PageNeedsToBeSegmented; import 
se.wikimedia.wikispeech.prerender.service.prevalence.transaction.AddSegmentVoiceFailure; import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.CreateOrUpdatePageSegmentVoice; +import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.FlushSegmentPageVoice; import java.io.PrintWriter; import java.io.StringWriter; @@ -26,7 +30,7 @@ import java.util.*; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.CountDownLatch; -import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.TimeUnit; @Service public class SynthesizeService extends AbstractLifecycle implements SmartLifecycle { @@ -44,22 +48,30 @@ public class SynthesizeService extends AbstractLifecycle implements SmartLifecyc private final PriorityService priorityService; + private final SegmentService segmentService; + + /** + * If gathered candidates is more than this number, + * then cut candidates at end of queue and flush from prevalence. + */ + private final int flushCandidatesThreshold = 100000; + private final int numberOfWorkerThreads = 2; private final int maximumNumberOfCandidates = 250; private ExecutorService workers; - private CountDownLatch populatorStoppedLatch; - private Thread populator; - + @Autowired public SynthesizeService( - @Autowired Prevalence prevalence, - @Autowired WikispeechApi wikispeechApi, - @Autowired PriorityService priorityService + Prevalence prevalence, + WikispeechApi wikispeechApi, + PriorityService priorityService, + SegmentService segmentService ) { this.prevalence = prevalence; this.wikispeechApi = wikispeechApi; this.priorityService = priorityService; + this.segmentService = segmentService; workers = new ExecutorService() { @Override @@ -77,75 +89,68 @@ protected void execute() { } } }; - - populator = new Thread(new Runnable() { - @Override - public void run() { - while (isRunning()) { - if (!queue.isEmpty()) { - try { - Thread.sleep(Duration.ofSeconds(5).toMillis()); - } catch 
(InterruptedException ie) { - log.info("Interrupted while waiting for queue to be processed.", ie); - return; - } - } else { - List candidates; - try { - long started = System.currentTimeMillis(); - candidates = prevalence.execute( - new GatherCandidatesQuery()); - long millisecondsSpend = System.currentTimeMillis() - started; - log.debug("Gathered {} segments to synthesize in {} milliseconds.", candidates.size(), millisecondsSpend); - } catch (Exception e) { - log.error("Failed to gather candidates for synthesis", e); - continue; - } - // todo if empty, then sleep for a while - long started = System.currentTimeMillis(); - candidates.sort(new GatherCandidatesQuery.SegmentVoiceToBeSynthesizedComparator(priorityService, true)); - long millisecondsSpend = System.currentTimeMillis() - started; - log.debug("Prioritized {} segments to synthesize in {} milliseconds.", candidates.size(), millisecondsSpend); - queue.addAll(candidates); - SynthesizeService.this.candidates = candidates; - - if (candidates.isEmpty()) { - try { - Thread.sleep(10000); - } catch (InterruptedException ie) { - log.error("Interrupted while pausing to await new segments.", ie); - return; - } - } - - } - } - populatorStoppedLatch.countDown(); - log.info("Thread stops."); - } - }); } + @Override protected void doStart() { - populatorStoppedLatch = new CountDownLatch(1); - populator.start(); workers.start(); } @Override protected void doStop() { + workers.stop(); + } + + @Scheduled(initialDelay = 10, fixedDelay = 60 * 5, timeUnit = TimeUnit.SECONDS) + public void repopulateCandidates() { + log.info("Repopulating candidates to be synthesized..."); + List candidates; try { - populatorStoppedLatch.await(); - } catch (InterruptedException ie) { - log.warn("Interrupted while waiting for populator to stop", ie); + long started = System.currentTimeMillis(); + candidates = prevalence.execute(new GatherCandidatesQuery()); + long millisecondsSpend = System.currentTimeMillis() - started; + log.debug("Gathered {} 
segments to synthesize in {} milliseconds.", candidates.size(), millisecondsSpend); + } catch (Exception e) { + log.error("Failed to gather candidates for synthesis", e); + return; } - workers.stop(); + + long started = System.currentTimeMillis(); + candidates.sort(new GatherCandidatesQuery.SegmentVoiceToBeSynthesizedComparator(priorityService, false)); + long millisecondsSpend = System.currentTimeMillis() - started; + log.debug("Prioritized {} segments to synthesize in {} milliseconds.", candidates.size(), millisecondsSpend); + + if (candidates.size() >= flushCandidatesThreshold) { + log.info("There are {} candidates, {} will be flushed...", candidates.size(), candidates.size() - flushCandidatesThreshold); + List candidatesToBeFlushed = candidates.subList(flushCandidatesThreshold, candidates.size() - 1); + Map> pagesTouchedPerWiki = new HashMap<>(); + int segmentVoicesFlushed = 0; + for (CandidateToBeSynthesized candidate : candidatesToBeFlushed) { + Set pagesTouched = pagesTouchedPerWiki.computeIfAbsent(candidate.getWiki(), k -> new HashSet<>(10000)); + pagesTouched.add(candidate.getPage()); + // remove pagesegementvoice from pagesegment + // and remove segment from page if no voices left + // and remove page from wiki if no segments left + prevalence.execute(new FlushSegmentPageVoice( + candidate.getWiki().getConsumerUrl(), + candidate.getPage().getTitle(), + candidate.getPageSegment().getHash(), + candidate.getVoice())); + segmentVoicesFlushed++; + } + log.info("Flushed out {} voices from {} pages in {} wikis", segmentVoicesFlushed, pagesTouchedPerWiki.values().stream().mapToInt(Set::size).sum(), pagesTouchedPerWiki.size()); + candidates = candidates.subList(0, flushCandidatesThreshold); + } + + queue.addAll(candidates); + SynthesizeService.this.candidates = candidates; + } private void synthesize(CandidateToBeSynthesized candidate) { try { - log.info("Synthesizing voice {} for hash {} in page {} at {}", candidate.getVoice(), 
candidate.getPageSegment().getHash(), candidate.getPage().getTitle(),candidate.getWiki().getConsumerUrl()); + log.info("Synthesizing voice '{}' for hash {} at segment index {} in page '{}' of wiki {} with priority {}", candidate.getVoice(), Base64.getEncoder().encodeToString(candidate.getPageSegment().getHash()), candidate.getPageSegment().getLowestIndexAtSegmentation(), candidate.getPage().getTitle(), candidate.getWiki().getConsumerUrl(), candidate.getPriority().getValue()); WikispeechApi.ListenResponseEnvelope responseEnvelope = wikispeechApi.listen( candidate.getWiki().getConsumerUrl(), @@ -156,15 +161,27 @@ private void synthesize(CandidateToBeSynthesized candidate) { candidate.getVoice() ); prevalence.execute( - new CreateOrUpdatePageSegmentVoice( + new FlushSegmentPageVoice( candidate.getWiki().getConsumerUrl(), candidate.getPage().getTitle(), candidate.getPageSegment().getHash(), - responseEnvelope.getRevision(), candidate.getVoice() )); } catch (Exception e) { - log.error("Failed to synthesize {}", candidate, e); + + if (e instanceof WikispeechApi.MWException) { + WikispeechApi.MWException mwException = (WikispeechApi.MWException) e; + if (mwException.getError().get("error").hasNonNull("errorclass")) { + String errorClass = mwException.getError().get("error").get("errorclass").textValue(); + if ("MediaWiki\\\\Wikispeech\\\\Segment\\\\OutdatedOrInvalidRevisionException".equals(errorClass)) { + // send page back to segmentation + segmentService.queue(candidate.getWiki().getConsumerUrl(), candidate.getPage().getTitle()); + return; + } + } + } + + log.error("Failed to synthesize voice '{}' for hash {} at segment index {} in page '{}' of wiki {} with priority {}", candidate.getVoice(), Base64.getEncoder().encodeToString(candidate.getPageSegment().getHash()), candidate.getPageSegment().getLowestIndexAtSegmentation(), candidate.getPage().getTitle(), candidate.getWiki().getConsumerUrl(), candidate.getPriority().getValue(), e); StringWriter stacktrace = new 
StringWriter(4096); stacktrace.append(e.getMessage()); stacktrace.append("\n"); diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java index 86ca379..4b9d602 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java @@ -5,6 +5,7 @@ import org.apache.logging.log4j.Logger; import org.prevayler.*; import org.springframework.context.SmartLifecycle; +import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; import se.wikimedia.wikispeech.prerender.service.AbstractLifecycle; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; @@ -17,6 +18,9 @@ import java.time.LocalDateTime; import java.time.ZoneId; import java.util.Date; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; @Service public class Prevalence extends AbstractLifecycle implements SmartLifecycle { @@ -96,5 +100,29 @@ private void initializePrevalence() throws Exception { // execute(createWiki); } + private static final Pattern journalFileNamePattern = Pattern.compile("(\\d+)\\.journal"); + + @Scheduled(fixedDelay = 7, initialDelay = 1, timeUnit = TimeUnit.DAYS) + public void snapshotAndRemoveJournals() throws Exception { + File snapshot = prevalyer.takeSnapshot(); + Long currentJournal = null; + for (File file : snapshot.getParentFile().listFiles()) { + if (!file.equals(snapshot)) { + Matcher matcher = journalFileNamePattern.matcher(file.getName()); + if (matcher.matches()) { + long journalIdentity = Long.parseLong(matcher.group(1)); + if (currentJournal == null || journalIdentity > currentJournal) { + currentJournal = journalIdentity; + } else { + log.info("Removing {}", file.getName()); + file.delete(); + } + } else { + log.info("Removing 
{}", file.getName()); + file.delete(); + } + } + } + } } diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/PageSegment.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/PageSegment.java index f2b6ea6..667e702 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/PageSegment.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/PageSegment.java @@ -3,6 +3,7 @@ import lombok.Data; import java.io.Serializable; +import java.time.OffsetDateTime; import java.util.ArrayList; import java.util.List; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPages.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPages.java new file mode 100644 index 0000000..7339892 --- /dev/null +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPages.java @@ -0,0 +1,60 @@ +package se.wikimedia.wikispeech.prerender.service.prevalence.transaction; + +import lombok.Data; +import org.prevayler.Transaction; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegment; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegmentVoice; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; + +import java.time.LocalDateTime; +import java.util.Date; +import java.util.Iterator; + +@Data +public class FlushPages implements Transaction { + + private static final long serialVersionUID = 1L; + + private LocalDateTime earliestAllowedTimestampSegmented; + + public FlushPages() { + } + + public FlushPages(LocalDateTime earliestAllowedTimestampSegmented) { + this.earliestAllowedTimestampSegmented = 
earliestAllowedTimestampSegmented; + } + + @Override + public void executeOn(Root root, Date executionTime) { + for (Wiki wiki : root.getWikiByConsumerUrl().values()) { + for (Iterator pages = wiki.getPagesByTitle().values().iterator(); pages.hasNext(); ) { + Page page = pages.next(); + if (page.getTimestampSegmented() != null + && page.getTimestampSegmented().isBefore(earliestAllowedTimestampSegmented)) { + + boolean pageShouldBeRemoved = true; + if (page.getSegments() != null && !page.getSegments().isEmpty()) { + for (PageSegment segment : page.getSegments()) { + if (segment.getSynthesizedVoices() != null && !segment.getSynthesizedVoices().isEmpty()) { + for (PageSegmentVoice voice : segment.getSynthesizedVoices()) { + if (voice.getFailedAttempts() == null || voice.getFailedAttempts().isEmpty()) { + pageShouldBeRemoved = false; + break; + } + } + } + if (!pageShouldBeRemoved) break; + } + } + + if (pageShouldBeRemoved) + pages.remove(); + + } + + } + } + } +} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushSegmentPageVoice.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushSegmentPageVoice.java new file mode 100644 index 0000000..152ba1d --- /dev/null +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushSegmentPageVoice.java @@ -0,0 +1,50 @@ +package se.wikimedia.wikispeech.prerender.service.prevalence.transaction; + +import lombok.Data; +import org.prevayler.Transaction; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegment; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; + +import java.util.Arrays; +import java.util.Date; +import java.util.Iterator; + +@Data +public class FlushSegmentPageVoice implements Transaction { + + 
private static final long serialVersionUID = 1L; + + private String wikiConsumerUrl; + private String pageTitle; + private byte[] segmentHash; + private String voice; + + public FlushSegmentPageVoice() { + } + + public FlushSegmentPageVoice(String wikiConsumerUrl, String pageTitle, byte[] segmentHash, String voice) { + this.wikiConsumerUrl = wikiConsumerUrl; + this.pageTitle = pageTitle; + this.segmentHash = segmentHash; + this.voice = voice; + } + + @Override + public void executeOn(Root root, Date executionTime) { + Wiki wiki = root.getWikiByConsumerUrl().get(wikiConsumerUrl); + Page page = wiki.getPagesByTitle().get(pageTitle); + + for (Iterator pageSegments = page.getSegments().iterator(); pageSegments.hasNext(); ) { + PageSegment pageSegment = pageSegments.next(); + if (Arrays.equals(segmentHash, pageSegment.getHash())) { + pageSegment.getSynthesizedVoices().removeIf(pageSegmentVoice -> voice.equals(pageSegmentVoice.getVoice())); + if (pageSegment.getSynthesizedVoices() == null || pageSegment.getSynthesizedVoices().isEmpty()) + pageSegments.remove(); + break; + } + } + } + +} From 67a1d2b46f8436755fdc47473859122fc4426899 Mon Sep 17 00:00:00 2001 From: kalle Date: Tue, 14 Mar 2023 11:47:21 +0100 Subject: [PATCH 3/8] Removed deprecated command pattern Optimize namespace imports --- .../prerender/mediawiki/WikispeechApi.java | 1 - .../prerender/rest/MainController.java | 5 +++- .../prerender/service/AbstractLifecycle.java | 1 + .../wikispeech/prerender/service/Influx.java | 4 --- .../prerender/service/PageCleanupService.java | 9 ------- .../prerender/service/PriorityService.java | 5 +++- .../service/RecentChangesService.java | 5 +++- .../prerender/service/SegmentService.java | 7 +++-- .../prerender/service/SynthesizeService.java | 5 ---- .../service/prevalence/Prevalence.java | 3 --- .../service/prevalence/domain/Root.java | 7 ----- .../prevalence/domain/command/Command.java | 17 ------------ .../domain/command/CommandVisitor.java | 11 -------- 
.../prevalence/domain/command/CrawlSite.java | 22 ---------------- .../domain/command/PollRecentChanges.java | 22 ---------------- ...nksAndQueueLinkedPagesForSegmentation.java | 25 ------------------ .../domain/command/SegmentPage.java | 18 ------------- .../command/SynthesizeSegmentVoice.java | 26 ------------------- .../prevalence/domain/state/PageSegment.java | 1 - .../prevalence/query/FindSegmentedPage.java | 2 +- .../transaction/AddSegmentVoiceFailure.java | 2 -- .../transaction/CreateNonSegmentedPage.java | 1 - .../transaction/SetWikiMainPage.java | 1 - .../SpecialPagesWithMostVersionsParser.java | 2 -- .../prerender/site/WikiResolver.java | 2 -- 25 files changed, 19 insertions(+), 185 deletions(-) delete mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/Command.java delete mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/CommandVisitor.java delete mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/CrawlSite.java delete mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/PollRecentChanges.java delete mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/ScrapePageForWikiLinksAndQueueLinkedPagesForSegmentation.java delete mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/SegmentPage.java delete mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/SynthesizeSegmentVoice.java diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java b/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java index 9eb2a15..a1e972b 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java @@ -1,6 +1,5 @@ package 
se.wikimedia.wikispeech.prerender.mediawiki; -import com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java b/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java index 30d4d91..30c58aa 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java @@ -9,7 +9,10 @@ import org.prevayler.Query; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.ResponseEntity; -import org.springframework.web.bind.annotation.*; +import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestMethod; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; import se.wikimedia.wikispeech.prerender.Collector; import se.wikimedia.wikispeech.prerender.LocalCache; import se.wikimedia.wikispeech.prerender.mediawiki.PageApi; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/AbstractLifecycle.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/AbstractLifecycle.java index 17e7a8c..63f3c0d 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/AbstractLifecycle.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/AbstractLifecycle.java @@ -3,6 +3,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.springframework.context.Lifecycle; + import java.util.concurrent.atomic.AtomicBoolean; public abstract class AbstractLifecycle implements Lifecycle { diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/Influx.java 
b/src/main/java/se/wikimedia/wikispeech/prerender/service/Influx.java index 060c18d..6911238 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/Influx.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/Influx.java @@ -5,12 +5,8 @@ import org.influxdb.InfluxDBFactory; import org.influxdb.dto.Point; import org.influxdb.dto.Query; -import org.springframework.context.SmartLifecycle; import org.springframework.stereotype.Service; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; - @Service public class Influx extends AbstractLifecycle { // uncomment to startup implements SmartLifecycle { diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/PageCleanupService.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/PageCleanupService.java index b2147e7..724c1b5 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/PageCleanupService.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/PageCleanupService.java @@ -2,21 +2,12 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.prevayler.Query; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; import se.wikimedia.wikispeech.prerender.service.prevalence.Prevalence; -import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; -import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; -import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; import java.time.Duration; -import java.time.LocalDateTime; -import java.util.Date; -import java.util.HashMap; -import java.util.Map; -import java.util.Set; import java.util.concurrent.TimeUnit; @Service diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/PriorityService.java 
b/src/main/java/se/wikimedia/wikispeech/prerender/service/PriorityService.java index 28fa573..0520394 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/PriorityService.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/PriorityService.java @@ -12,7 +12,10 @@ import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; import java.time.LocalDateTime; -import java.util.*; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/RecentChangesService.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/RecentChangesService.java index b6e4587..9ac17fe 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/RecentChangesService.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/RecentChangesService.java @@ -17,7 +17,10 @@ import java.time.Duration; import java.time.LocalDateTime; -import java.util.*; +import java.util.Collection; +import java.util.Date; +import java.util.HashSet; +import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CountDownLatch; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/SegmentService.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/SegmentService.java index 1d4b9a0..3f20efe 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/SegmentService.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/SegmentService.java @@ -12,11 +12,14 @@ import se.wikimedia.wikispeech.prerender.mediawiki.WikispeechApi; import se.wikimedia.wikispeech.prerender.service.prevalence.Prevalence; import se.wikimedia.wikispeech.prerender.service.prevalence.query.*; -import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.CreateSegmentedPage; import 
se.wikimedia.wikispeech.prerender.service.prevalence.transaction.CreatePageSegment; +import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.CreateSegmentedPage; import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.FinalizedPageSegmented; -import java.util.*; +import java.util.HashSet; +import java.util.Map; +import java.util.Queue; +import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentLinkedQueue; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java index acb2573..6fe01b4 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java @@ -13,23 +13,18 @@ import se.wikimedia.wikispeech.prerender.mediawiki.WikispeechApi; import se.wikimedia.wikispeech.prerender.service.prevalence.Prevalence; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; -import se.wikimedia.wikispeech.prerender.service.prevalence.domain.command.SegmentPage; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegment; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegmentVoice; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; -import se.wikimedia.wikispeech.prerender.service.prevalence.query.PageNeedsToBeSegmented; import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.AddSegmentVoiceFailure; -import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.CreateOrUpdatePageSegmentVoice; import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.FlushSegmentPageVoice; import java.io.PrintWriter; import java.io.StringWriter; -import java.time.Duration; import 
java.time.LocalDateTime; import java.util.*; import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; @Service diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java index 4b9d602..320087e 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java @@ -9,11 +9,8 @@ import org.springframework.stereotype.Service; import se.wikimedia.wikispeech.prerender.service.AbstractLifecycle; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; -import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.CreateWiki; -import se.wikimedia.wikispeech.prerender.site.SwedishWikipedia; import java.io.File; -import java.time.Duration; import java.time.Instant; import java.time.LocalDateTime; import java.time.ZoneId; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/Root.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/Root.java index e4b5bf6..bd33a1d 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/Root.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/Root.java @@ -1,15 +1,11 @@ package se.wikimedia.wikispeech.prerender.service.prevalence.domain; import lombok.Data; -import se.wikimedia.wikispeech.prerender.service.prevalence.domain.command.Command; -import se.wikimedia.wikispeech.prerender.service.prevalence.domain.command.SynthesizeSegmentVoice; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; import java.io.Serializable; import java.util.HashMap; -import java.util.LinkedList; import java.util.Map; -import java.util.Queue; @Data public class Root implements 
Serializable { @@ -18,9 +14,6 @@ public class Root implements Serializable { private Map wikiByConsumerUrl = new HashMap<>(); - private Queue commandQueue = new LinkedList<>(); - private Queue synthesizeSegmentsQueue = new LinkedList<>(); - private Intern internedVoices = new Intern<>(); private Intern internedLanguages = new Intern<>(); diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/Command.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/Command.java deleted file mode 100644 index 29cd096..0000000 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/Command.java +++ /dev/null @@ -1,17 +0,0 @@ -package se.wikimedia.wikispeech.prerender.service.prevalence.domain.command; - -import lombok.Data; - -import java.io.Serializable; -import java.time.LocalDateTime; - -@Data -public abstract class Command implements Serializable { - - private static final long serialVersionUID = 1L; - - private LocalDateTime created; - - public abstract R accept(CommandVisitor visitor); - -} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/CommandVisitor.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/CommandVisitor.java deleted file mode 100644 index 99f1b7b..0000000 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/CommandVisitor.java +++ /dev/null @@ -1,11 +0,0 @@ -package se.wikimedia.wikispeech.prerender.service.prevalence.domain.command; - -public interface CommandVisitor { - - public R visit(SegmentPage command); - public R visit(SynthesizeSegmentVoice command); - public R visit(ScrapePageForWikiLinksAndQueueLinkedPagesForSegmentation command); - public R visit(CrawlSite command); - public R visit(PollRecentChanges command); - -} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/CrawlSite.java 
b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/CrawlSite.java deleted file mode 100644 index fd1d1c0..0000000 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/CrawlSite.java +++ /dev/null @@ -1,22 +0,0 @@ -package se.wikimedia.wikispeech.prerender.service.prevalence.domain.command; - -import lombok.Data; - -@Data -public class CrawlSite extends Command { - - private static final long serialVersionUID = 1L; - - private String consumerUrl; - private String startingPointTitle; - private int maximumDepth; - - private String linksExpression = "//*[@id='bodyContent']//A[starts-with(@href, '/wiki/')]"; - private String allowedHrefPattern = "/wiki/[^:]+"; - - - @Override - public R accept(CommandVisitor visitor) { - return visitor.visit(this); - } -} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/PollRecentChanges.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/PollRecentChanges.java deleted file mode 100644 index 103594f..0000000 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/PollRecentChanges.java +++ /dev/null @@ -1,22 +0,0 @@ -package se.wikimedia.wikispeech.prerender.service.prevalence.domain.command; - -import lombok.Data; - -import java.time.ZonedDateTime; - -@Data -public class PollRecentChanges extends Command { - - private static final long serialVersionUID = 1L; - - private String consumerUrl; - private ZonedDateTime startTimestamp; - - private String linksExpression = "//*[@id='bodyContent']//A[starts-with(@href, '/wiki/')]"; - private String allowedHrefPattern = "/wiki/[^:]+"; - - @Override - public R accept(CommandVisitor visitor) { - return visitor.visit(this); - } -} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/ScrapePageForWikiLinksAndQueueLinkedPagesForSegmentation.java 
b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/ScrapePageForWikiLinksAndQueueLinkedPagesForSegmentation.java deleted file mode 100644 index 7a64551..0000000 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/ScrapePageForWikiLinksAndQueueLinkedPagesForSegmentation.java +++ /dev/null @@ -1,25 +0,0 @@ -package se.wikimedia.wikispeech.prerender.service.prevalence.domain.command; - -import lombok.Data; - -import java.util.Collection; - -@Data -public class ScrapePageForWikiLinksAndQueueLinkedPagesForSegmentation extends Command { - - private static final long serialVersionUID = 1L; - - private String consumerUrl; - private String title; - - private String language; - private Collection voices; - - private String linksExpression = "//*[@id='bodyContent']//A[starts-with(@href, '/wiki/')]"; - private String allowedHrefPattern = "/wiki/[^:]+"; - - @Override - public R accept(CommandVisitor visitor) { - return visitor.visit(this); - } -} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/SegmentPage.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/SegmentPage.java deleted file mode 100644 index 9f3ca94..0000000 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/SegmentPage.java +++ /dev/null @@ -1,18 +0,0 @@ -package se.wikimedia.wikispeech.prerender.service.prevalence.domain.command; - -import lombok.Data; -import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; - -@Data -public class SegmentPage extends Command { - - private static final long serialVersionUID = 1L; - - private Wiki wiki; - private String title; - - @Override - public R accept(CommandVisitor visitor) { - return visitor.visit(this); - } -} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/SynthesizeSegmentVoice.java 
b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/SynthesizeSegmentVoice.java deleted file mode 100644 index 83f671a..0000000 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/command/SynthesizeSegmentVoice.java +++ /dev/null @@ -1,26 +0,0 @@ -package se.wikimedia.wikispeech.prerender.service.prevalence.domain.command; - -import lombok.Data; - -import java.util.List; - -@Data -public class SynthesizeSegmentVoice extends Command { - - private static final long serialVersionUID = 1L; - - private String consumerUrl; - private String title; - private byte[] hash; - private String voice; - - private int contentStartOffset; - private int contentEndOffset; - private List contentXPathExpressions; - private List contentTexts; - - @Override - public R accept(CommandVisitor visitor) { - return visitor.visit(this); - } -} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/PageSegment.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/PageSegment.java index 667e702..f2b6ea6 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/PageSegment.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/PageSegment.java @@ -3,7 +3,6 @@ import lombok.Data; import java.io.Serializable; -import java.time.OffsetDateTime; import java.util.ArrayList; import java.util.List; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/FindSegmentedPage.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/FindSegmentedPage.java index 73195ed..23453c7 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/FindSegmentedPage.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/FindSegmentedPage.java @@ -2,9 +2,9 @@ import lombok.Data; import org.prevayler.Query; -import 
se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; import java.util.Date; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/AddSegmentVoiceFailure.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/AddSegmentVoiceFailure.java index 69aa142..67da7b3 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/AddSegmentVoiceFailure.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/AddSegmentVoiceFailure.java @@ -2,13 +2,11 @@ import lombok.Data; import org.prevayler.Transaction; -import org.prevayler.TransactionWithQuery; import se.wikimedia.wikispeech.prerender.service.prevalence.Prevalence; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegment; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegmentVoice; -import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; import java.util.Arrays; import java.util.Date; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateNonSegmentedPage.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateNonSegmentedPage.java index 7f068ad..3a83439 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateNonSegmentedPage.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateNonSegmentedPage.java @@ -2,7 +2,6 @@ import lombok.Data; import 
org.prevayler.TransactionWithQuery; -import se.wikimedia.wikispeech.prerender.service.prevalence.Prevalence; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/SetWikiMainPage.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/SetWikiMainPage.java index 4db9885..6866b11 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/SetWikiMainPage.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/SetWikiMainPage.java @@ -5,7 +5,6 @@ import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; -import java.time.OffsetDateTime; import java.util.Date; @Data diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/site/SpecialPagesWithMostVersionsParser.java b/src/main/java/se/wikimedia/wikispeech/prerender/site/SpecialPagesWithMostVersionsParser.java index 7af7382..b0c5bdd 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/site/SpecialPagesWithMostVersionsParser.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/site/SpecialPagesWithMostVersionsParser.java @@ -11,8 +11,6 @@ import javax.xml.xpath.*; import java.io.IOException; -import java.time.Duration; -import java.time.LocalDateTime; import java.util.regex.Matcher; import java.util.regex.Pattern; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/site/WikiResolver.java b/src/main/java/se/wikimedia/wikispeech/prerender/site/WikiResolver.java index 3dcdb8d..bce04ba 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/site/WikiResolver.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/site/WikiResolver.java @@ 
-8,8 +8,6 @@ import lombok.Getter; import okhttp3.*; import org.codelibs.nekohtml.parsers.DOMParser; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; import org.w3c.dom.Document; import org.xml.sax.InputSource; import org.xml.sax.SAXException; From 7a0160604217073dead9234db32dbe0ac45ac57c Mon Sep 17 00:00:00 2001 From: kalle Date: Mon, 27 Mar 2023 09:50:49 +0200 Subject: [PATCH 4/8] Removed deprecated command pattern Optimize namespace imports --- README.md | 32 ++++++- run.sh | 2 +- .../prerender/mediawiki/WikispeechApi.java | 44 +++++----- .../prerender/service/CreateWikiCommand.java | 4 +- .../service/MainPageLinksPrioritizer.java | 24 ++++- .../prerender/service/PageCleanupService.java | 32 ------- .../prerender/service/PriorityService.java | 3 +- .../prerender/service/SynthesizeService.java | 75 ++++++++++------ .../service/prevalence/Prevalence.java | 2 +- .../service/prevalence/domain/Root.java | 7 ++ .../service/prevalence/domain/state/Page.java | 19 ++++ .../prevalence/domain/state/PageSegment.java | 8 ++ .../service/prevalence/domain/state/Wiki.java | 13 +++ .../service/prevalence/query/GetPage.java | 28 ++++++ .../prevalence/query/PageSegmentExists.java | 7 +- .../transaction/CreateNonSegmentedPage.java | 6 +- .../transaction/CreateOrGetPage.java | 38 ++++++++ .../CreateOrUpdatePageSegmentVoice.java | 8 +- .../transaction/FindPagesToBeFlushed.java | 88 +++++++++++++++++++ .../transaction/FlushPageSegment.java | 46 ++++++++++ ...eVoice.java => FlushPageSegmentVoice.java} | 6 +- .../prevalence/transaction/FlushPages.java | 43 ++------- .../SetPageTimestampDontFlushUntil.java | 37 ++++++++ .../transaction/SetWikiMainPage.java | 8 +- 24 files changed, 449 insertions(+), 131 deletions(-) delete mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/PageCleanupService.java create mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/GetPage.java create 
mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateOrGetPage.java create mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FindPagesToBeFlushed.java create mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPageSegment.java rename src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/{FlushSegmentPageVoice.java => FlushPageSegmentVoice.java} (91%) create mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/SetPageTimestampDontFlushUntil.java diff --git a/README.md b/README.md index 500750f..253eb54 100644 --- a/README.md +++ b/README.md @@ -64,20 +64,44 @@ Basically this means the following order when synthesizing: 6. The second segment in pages found in recent changes. 7. ... until all segments in all pages found in recent changes has been synthesized. +Candidates to be synthesized is re-evaluated every five minutes. + +### Automatic flushing of segments + As the number of candidates to be synthesized can grow very large in a rather short time, a flushing mechanism kicks in when there are more than 100,000 candidates in the queue, removing those with the lowest priority and retains the top 100,000. +Segments flushing exists in order to save RAM, as the state of the application is kept in heap. + +### Automatic flushing of pages + +After one day of inactivity to a page on a wiki, +the state of rendering for that page will be a candidate for being flushed out. +If there are still segments that have not been synthesized, this occurs after two days. + +Flushing a page means that if there is a change after the flush, +the complete page will be re-synthesized. +(Re-synthesized as in requested to be listened to. Wikispeech backend might in fact be cached.) + +The main page will never be flushed out. 
+ +Pages that are linked to from the main page will not be flushed out until five days after they were last seen on the main page. + +Page flushing exists in order to save RAM, as the state of the application is kept in heap. + +### Failing segment voices + +Will be a candidate to be retried every n hours, where n=number of previous failures. + ## TODO * Add feature in Wikispeech to not send audio response on synthesis, in order to minimize network data. -* Flush out pages that have not been updated for x days -* Make the 100000 value in a property file or something (hard coded in SynthesizeService.java) -* Make default priorities configurable in properties file (hard coded 5F in MainPagePrioritizer.java) +* Add feature in Wikispeech to list all cached segments and voices for a given page, in order to avoid requesting synthesis when not needed. +* Make all hard coded value mentioned above configurable in a properties file or something. * Report state of candidate, flushing, etc to influxdb. - ## REST Most REST calls are exist for debug and development reasons. 
diff --git a/run.sh b/run.sh index f501588..6808fe5 100755 --- a/run.sh +++ b/run.sh @@ -1,4 +1,4 @@ #!/bin/bash -export MAVEN_OPTS="-Xmx1g" +export MAVEN_OPTS="-Xmx3g" mvn clean install mvn exec:java -Dinflux.username="" -Dinflux.password="" -Dexec.mainClass="se.wikimedia.wikispeech.prerender.WebApp" -Dserver.port="9090" diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java b/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java index a1e972b..da00e1b 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java @@ -98,6 +98,10 @@ public String getExceptionClass() { } } + /** + * A cache of the greatest revision known of all pages processed in all wikis + * todo: Make it a guava-cache that flush out oldest items after X MB size + */ private Map> mostRecentRevisionSeenCache = new HashMap<>(); private long getGreatestRevisionKnown(String consumerUrl, String title, long knownRevision) { @@ -121,7 +125,7 @@ public ListenResponseEnvelope listen(String consumerUrl, String title, String se envelope.setResponse(doListen(consumerUrl, segmentHash, greatestRevisionKnown, language, voice)); envelope.setRevision(lastKnownRevision); } catch (MWException mwException) { - if ("MediaWiki\\Wikispeech\\Segment\\OutdatedOrInvalidRevisionException".equals(mwException.getExceptionClass())) { + if (mwException.getError().toString().contains("OutdatedOrInvalidRevisionException")) { greatestRevisionKnown = getCurrentRevision(consumerUrl, title); setGreatestRevisionKnown(consumerUrl, title, greatestRevisionKnown); envelope.setResponse(doListen(consumerUrl, segmentHash, greatestRevisionKnown, language, voice)); @@ -152,30 +156,30 @@ private ListenResponse doListen(String consumerUrl, String segmentHash, long rev Call call = client.newCall(request); Response response = call.execute(); + final ByteArrayOutputStream baos; + 
try { + if (response.code() != 200) { + throw new IOException("Response" + response); + } - if (response.code() != 200) { - throw new IOException("Response" + response); + baos = new ByteArrayOutputStream(49152); + IOUtils.copy(response.body().byteStream(), baos); + } finally { + response.close(); } - ByteArrayOutputStream baos = new ByteArrayOutputStream(49152); - IOUtils.copy(response.body().byteStream(), baos); - try { - JsonNode json = objectMapper.readTree(baos.toByteArray()); - if (json.has("error")) { - if (json.get("error").has("errorclass")) { - throw new MWException(json); - } - throw new IOException("Wikispeech responded with an error!" + objectMapper.writeValueAsString(json)); + JsonNode json = objectMapper.readTree(baos.toByteArray()); + if (json.has("error")) { + if (json.get("error").has("errorclass")) { + throw new MWException(json); } - ListenResponse listenResponse = objectMapper.convertValue(json.get("wikispeech-listen"), ListenResponse.class); - if (listenResponse == null) { - throw new RuntimeException("Failed to deserialize JSON response!" + objectMapper.writeValueAsString(json)); - } - return listenResponse; - } catch (Exception exception) { - log.error("Failed processing response: {}", new String(baos.toByteArray()), exception); - throw exception; + throw new IOException("Wikispeech responded with an error!" + objectMapper.writeValueAsString(json)); + } + ListenResponse listenResponse = objectMapper.convertValue(json.get("wikispeech-listen"), ListenResponse.class); + if (listenResponse == null) { + throw new RuntimeException("Failed to deserialize JSON response!" 
+ objectMapper.writeValueAsString(json)); } + return listenResponse; } @Data diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/CreateWikiCommand.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/CreateWikiCommand.java index cdd8c39..cc7a652 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/CreateWikiCommand.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/CreateWikiCommand.java @@ -10,8 +10,7 @@ import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.SetWikiMainPage; import se.wikimedia.wikispeech.prerender.site.WikiResolver; -import java.time.Duration; -import java.time.OffsetDateTime; +import java.time.*; import java.util.ArrayList; import java.util.LinkedHashSet; import java.util.Map; @@ -79,6 +78,7 @@ public Wiki execute() throws Exception { createMainPage.setLanguageAtSegmentation(pageInfo.getPageLanguage()); createMainPage.setRevisionAtSegmentation(pageInfo.getLastRevisionIdentity()); createMainPage.setPriority(mainPagePriority); + createMainPage.setTimestampDontFlushUntil(LocalDateTime.ofInstant(Instant.ofEpochMilli(Long.MAX_VALUE), ZoneOffset.UTC)); Page mainPage = prevalence.execute(createMainPage); prevalence.execute(new SetWikiMainPage(consumerUrl, mainPage.getTitle())); diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java index 2bc4d95..5235051 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java @@ -12,7 +12,11 @@ import se.wikimedia.wikispeech.prerender.mediawiki.PageApi; import se.wikimedia.wikispeech.prerender.service.prevalence.Prevalence; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; import 
se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; +import se.wikimedia.wikispeech.prerender.service.prevalence.query.GetPage; +import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.CreateOrGetPage; +import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.SetPageTimestampDontFlushUntil; import se.wikimedia.wikispeech.prerender.site.ScrapePageForWikiLinks; import java.time.Duration; @@ -47,12 +51,14 @@ public MainPageLinksPrioritizer( private final Map lastChangedByWikiConsumerUrl = new HashMap<>(); - private boolean initialRun = false; + private boolean initialRun = true; @Scheduled(fixedDelay = 1, timeUnit = TimeUnit.MINUTES, initialDelay = 0) public void run() throws Exception { LocalDateTime now = LocalDateTime.now(); LocalDateTime future = now.plus(Duration.ofDays(1)); + LocalDateTime timestampDontFlushUntil = now.plusDays(5); + for (Wiki wiki : prevalence.execute(new Query>() { @Override public Set query(Root root, Date date) { @@ -64,6 +70,9 @@ public Set query(Root root, Date date) { OffsetDateTime lastChanged = OffsetDateTime.parse(pageApi.getHttpHeaders(wiki.getConsumerUrl(), wiki.getMainPage().getTitle()).get("Last-Modified"), DateTimeFormatter.RFC_1123_DATE_TIME); OffsetDateTime previousLastChanged = lastChangedByWikiConsumerUrl.put(wiki.getConsumerUrl(), lastChanged); if (initialRun || !lastChanged.equals(previousLastChanged)) { + // re-segment main page + segmentService.segment(wiki.getConsumerUrl(), wiki.getMainPage().getTitle()); + // priority service is not persistent, we need to reapply priority from main page on restart. initialRun = false; log.info("Setting priority for links in {} of {}", wiki.getMainPage().getTitle(), wiki.getName()); @@ -73,6 +82,19 @@ public Set query(Root root, Date date) { scraper.setCollector(new Collector() { @Override public boolean collect(String title) { + + // don't flush pages linked from main page in x days. 
+ try { + Page page = prevalence.execute(new GetPage(wiki.getConsumerUrl(), title)); + if (page == null) + page = prevalence.execute(new CreateOrGetPage(wiki.getConsumerUrl(), title)); + if (page.getTimestampDontFlushUntil() == null || page.getTimestampDontFlushUntil().isBefore(timestampDontFlushUntil)) + prevalence.execute(new SetPageTimestampDontFlushUntil(wiki.getConsumerUrl(), title, timestampDontFlushUntil)); + } catch (Exception e) { + log.error("Failed to set timestampDontFlushUntil of page {} linked from main page on wiki {}", title, wiki.getConsumerUrl(), e); + } + + // keep priority of factor 5 for x days priorityService.put( new ConsumerUrlAndTitle(wiki.getConsumerUrl(), title), new PriorityService.PagePrioritySetting( diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/PageCleanupService.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/PageCleanupService.java deleted file mode 100644 index 724c1b5..0000000 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/PageCleanupService.java +++ /dev/null @@ -1,32 +0,0 @@ -package se.wikimedia.wikispeech.prerender.service; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.scheduling.annotation.Scheduled; -import org.springframework.stereotype.Service; -import se.wikimedia.wikispeech.prerender.service.prevalence.Prevalence; - -import java.time.Duration; -import java.util.concurrent.TimeUnit; - -@Service -public class PageCleanupService { - - private final Logger log = LogManager.getLogger(getClass()); - - private final Prevalence prevalence; - - private final Duration maximumPageChangeAge = Duration.ofDays(7); - - @Autowired - public PageCleanupService(Prevalence prevalence) { - this.prevalence = prevalence; - } - - @Scheduled(initialDelay = 10, fixedDelay = 60 * 24, timeUnit = TimeUnit.MINUTES) - public void cleanup() throws Exception { - - } - 
-} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/PriorityService.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/PriorityService.java index 0520394..292b686 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/PriorityService.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/PriorityService.java @@ -61,10 +61,11 @@ public Boolean visit(PagePrioritySetting setting) { @Scheduled(fixedDelay = 1, timeUnit = TimeUnit.HOURS) public void expunge() { - log.info("Expunging..."); + log.info("Expunging priorities..."); for (Map.Entry entry : new HashSet<>(settings.entrySet())) { if (entry.getValue().getTo().isBefore(LocalDateTime.now())) { settings.remove(entry.getKey()); + log.info("Expunged {}", entry); } } } diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java index 6fe01b4..e1432f8 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java @@ -17,12 +17,14 @@ import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegment; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegmentVoice; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; -import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.AddSegmentVoiceFailure; -import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.FlushSegmentPageVoice; +import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.*; import java.io.PrintWriter; import java.io.StringWriter; +import java.time.Duration; +import java.time.Instant; import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.*; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.TimeUnit; @@ -34,6 +36,7 @@ 
public class SynthesizeService extends AbstractLifecycle implements SmartLifecyc @Getter private List candidates = new ArrayList<>(); + @Getter private final Queue queue = new ConcurrentLinkedQueue<>(); @@ -99,6 +102,21 @@ protected void doStop() { @Scheduled(initialDelay = 10, fixedDelay = 60 * 5, timeUnit = TimeUnit.SECONDS) public void repopulateCandidates() { + + queue.clear(); + + try { + log.info("Searching for pages to be flushed... Must be at least one day old. All older than 2 days will be flushed."); + Map> pageTitlesByWikiConsumerUrl = prevalence.execute(new FindPagesToBeFlushed(Duration.ofDays(1), Duration.ofDays(2))); + if (!pageTitlesByWikiConsumerUrl.isEmpty()) { + prevalence.execute(new FlushPages(pageTitlesByWikiConsumerUrl)); + log.info("Flushed pages {}", pageTitlesByWikiConsumerUrl); + } + } catch (Exception e) { + log.error("Failed to flush old or completely synthesized pages", e); + return; + } + log.info("Repopulating candidates to be synthesized..."); List candidates; try { @@ -117,25 +135,27 @@ public void repopulateCandidates() { log.debug("Prioritized {} segments to synthesize in {} milliseconds.", candidates.size(), millisecondsSpend); if (candidates.size() >= flushCandidatesThreshold) { - log.info("There are {} candidates, {} will be flushed...", candidates.size(), candidates.size() - flushCandidatesThreshold); - List candidatesToBeFlushed = candidates.subList(flushCandidatesThreshold, candidates.size() - 1); + log.info("There are {} candidates, ~{} will be flushed...", candidates.size(), candidates.size() - flushCandidatesThreshold); + List candidatesToBeFlushed = new ArrayList<>(candidates.subList(flushCandidatesThreshold, candidates.size() - 1)); Map> pagesTouchedPerWiki = new HashMap<>(); - int segmentVoicesFlushed = 0; + int segmentsFlushed = 0; for (CandidateToBeSynthesized candidate : candidatesToBeFlushed) { Set pagesTouched = pagesTouchedPerWiki.computeIfAbsent(candidate.getWiki(), k -> new HashSet<>(10000)); 
pagesTouched.add(candidate.getPage()); - // remove pagesegementvoice from pagesegment - // and remove segment from page if no voices left - // and remove page from wiki if no segments left - prevalence.execute(new FlushSegmentPageVoice( + + // this flush the segment, not the voice! + // we also remove all other voices for this segment in the candidates list! + + prevalence.execute(new FlushPageSegment( candidate.getWiki().getConsumerUrl(), candidate.getPage().getTitle(), - candidate.getPageSegment().getHash(), - candidate.getVoice())); - segmentVoicesFlushed++; + candidate.getPageSegment().getHash())); + + candidates.removeIf(c -> c.getPageSegment() == candidate.getPageSegment()); + + segmentsFlushed++; } - log.info("Flushed out {} voices from {} pages in {} wikis", segmentVoicesFlushed, pagesTouchedPerWiki.values().stream().mapToInt(Set::size).sum(), pagesTouchedPerWiki.size()); - candidates = candidates.subList(0, flushCandidatesThreshold); + log.info("Flushed out {} segments from {} pages in {} wikis", segmentsFlushed, pagesTouchedPerWiki.values().stream().mapToInt(Set::size).sum(), pagesTouchedPerWiki.size()); } queue.addAll(candidates); @@ -156,26 +176,15 @@ private void synthesize(CandidateToBeSynthesized candidate) { candidate.getVoice() ); prevalence.execute( - new FlushSegmentPageVoice( + new CreateOrUpdatePageSegmentVoice( candidate.getWiki().getConsumerUrl(), candidate.getPage().getTitle(), candidate.getPageSegment().getHash(), + responseEnvelope.getRevision(), candidate.getVoice() )); } catch (Exception e) { - if (e instanceof WikispeechApi.MWException) { - WikispeechApi.MWException mwException = (WikispeechApi.MWException) e; - if (mwException.getError().get("error").hasNonNull("errorclass")) { - String errorClass = mwException.getError().get("error").get("errorclass").textValue(); - if ("MediaWiki\\\\Wikispeech\\\\Segment\\\\OutdatedOrInvalidRevisionException".equals(errorClass)) { - // send page back to segmentation - 
segmentService.queue(candidate.getWiki().getConsumerUrl(), candidate.getPage().getTitle()); - return; - } - } - } - log.error("Failed to synthesize voice '{}' for hash {} at segment index {} in page '{}' of wiki {} with priority {}", candidate.getVoice(), Base64.getEncoder().encodeToString(candidate.getPageSegment().getHash()), candidate.getPageSegment().getLowestIndexAtSegmentation(), candidate.getPage().getTitle(), candidate.getWiki().getConsumerUrl(), candidate.getPriority().getValue(), e); StringWriter stacktrace = new StringWriter(4096); stacktrace.append(e.getMessage()); @@ -233,6 +242,18 @@ public List query(Root root, Date da if (segmentVoice == null || segmentVoice.getTimestampSynthesized() == null || segmentVoice.getTimestampSynthesized().isAfter(resynthesizeTimestamp)) { + if (segmentVoice != null + && segmentVoice.getFailedAttempts() != null + && !segmentVoice.getFailedAttempts().isEmpty() + ) { + LocalDateTime mostRecentFailedAttempt = LocalDateTime.ofInstant(Instant.ofEpochMilli(0), ZoneOffset.UTC); + for (LocalDateTime failedAttempt : segmentVoice.getFailedAttempts().keySet()) + if (mostRecentFailedAttempt.isBefore(failedAttempt)) + mostRecentFailedAttempt = failedAttempt; + // attempt every n hours, where n=number of previous failures + if (mostRecentFailedAttempt.isAfter(LocalDateTime.now().minusHours(segmentVoice.getFailedAttempts().size()))) + continue; + } candidates.add(new SynthesizeService.CandidateToBeSynthesized( wiki, page, segment, segmentVoice, language, voice )); diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java index 320087e..41419ff 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java @@ -99,7 +99,7 @@ private void initializePrevalence() throws Exception { private static final 
Pattern journalFileNamePattern = Pattern.compile("(\\d+)\\.journal"); - @Scheduled(fixedDelay = 7, initialDelay = 1, timeUnit = TimeUnit.DAYS) + @Scheduled(fixedDelay = 7, initialDelay = 2, timeUnit = TimeUnit.DAYS) public void snapshotAndRemoveJournals() throws Exception { File snapshot = prevalyer.takeSnapshot(); Long currentJournal = null; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/Root.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/Root.java index bd33a1d..3ead0b3 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/Root.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/Root.java @@ -17,4 +17,11 @@ public class Root implements Serializable { private Intern internedVoices = new Intern<>(); private Intern internedLanguages = new Intern<>(); + @Override + public String toString() { + return "Root{" + + "internedVoices=" + internedVoices + + ", internedLanguages=" + internedLanguages + + '}'; + } } diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/Page.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/Page.java index 8342614..7a28723 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/Page.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/Page.java @@ -22,4 +22,23 @@ public class Page implements Serializable { private Long revisionAtSegmentation; private String languageAtSegmentation; + /** + * Longevity of this page. + * E.g. 
main page should never be flushed out, + * pages linked from main page lives for quite some time (they are often large and have a lot of segments we don't want to rerender if we can avoid it) + * while most pages use system fallback (setting this value to null) + */ + private LocalDateTime timestampDontFlushUntil; + + + @Override + public String toString() { + return "Page{" + + "title='" + title + '\'' + + ", priority=" + priority + + ", timestampSegmented=" + timestampSegmented + + ", revisionAtSegmentation=" + revisionAtSegmentation + + ", languageAtSegmentation='" + languageAtSegmentation + '\'' + + '}'; + } } diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/PageSegment.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/PageSegment.java index f2b6ea6..9adc153 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/PageSegment.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/PageSegment.java @@ -4,6 +4,7 @@ import java.io.Serializable; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; @Data @@ -16,4 +17,11 @@ public class PageSegment implements Serializable { private int lowestIndexAtSegmentation; private List synthesizedVoices = new ArrayList<>(); + @Override + public String toString() { + return "PageSegment{" + + "hash=" + Arrays.toString(hash) + + ", lowestIndexAtSegmentation=" + lowestIndexAtSegmentation + + '}'; + } } diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/Wiki.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/Wiki.java index ac33fa4..7a05758 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/Wiki.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/Wiki.java @@ -35,4 +35,17 @@ public boolean equals(Object 
o) { public int hashCode() { return Objects.hash(consumerUrl); } + + @Override + public String toString() { + return "Wiki{" + + "name='" + name + '\'' + + ", consumerUrl='" + consumerUrl + '\'' + + ", maximumSynthesizedVoiceAge=" + maximumSynthesizedVoiceAge + + ", defaultLanguage='" + defaultLanguage + '\'' + + ", voicesPerLanguage=" + voicesPerLanguage + + ", timestampOfLastRecentChangesItemProcessed=" + timestampOfLastRecentChangesItemProcessed + + ", pollRecentChangesNamespaces=" + pollRecentChangesNamespaces + + '}'; + } } diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/GetPage.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/GetPage.java new file mode 100644 index 0000000..637075e --- /dev/null +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/GetPage.java @@ -0,0 +1,28 @@ +package se.wikimedia.wikispeech.prerender.service.prevalence.query; + +import org.prevayler.Query; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; + +import java.util.Date; + +public class GetPage implements Query { + + private String consumerUrl; + private String title; + + public GetPage(String consumerUrl, String title) { + this.consumerUrl = consumerUrl; + this.title = title; + } + + @Override + public Page query(Root root, Date date) throws Exception { + Wiki wiki = root.getWikiByConsumerUrl().get(consumerUrl); + if (wiki == null) { + return null; + } + return wiki.getPagesByTitle().get(title); + } +} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/PageSegmentExists.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/PageSegmentExists.java index 0463e6c..a45ef42 100644 --- 
a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/PageSegmentExists.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/PageSegmentExists.java @@ -2,7 +2,9 @@ import org.prevayler.Query; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegment; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; import java.util.Arrays; import java.util.Date; @@ -21,7 +23,10 @@ public PageSegmentExists(String consumerUrl, String title, byte[] hash) { @Override public Boolean query(Root root, Date date) throws Exception { - for (PageSegment segment : root.getWikiByConsumerUrl().get(consumerUrl).getPagesByTitle().get(title).getSegments()) { + Wiki wiki = root.getWikiByConsumerUrl().get(consumerUrl); + Page page = wiki == null ? null : wiki.getPagesByTitle().get(title); /* an unregistered wiki is treated like a missing page instead of throwing */ + if (page == null) return false; + for (PageSegment segment : page.getSegments()) { if (Arrays.equals(hash, segment.getHash())) { return true; } diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateNonSegmentedPage.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateNonSegmentedPage.java index 3a83439..c5880a7 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateNonSegmentedPage.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateNonSegmentedPage.java @@ -6,6 +6,7 @@ import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; +import java.time.LocalDateTime; import java.util.Date; @Data @@ -17,16 +18,18 @@ public class CreateNonSegmentedPage implements TransactionWithQuery private String title; private String
languageAtSegmentation; private long revisionAtSegmentation; + private LocalDateTime timestampDontFlushUntil; private float priority = 1F; public CreateNonSegmentedPage() { } - public CreateNonSegmentedPage(String consumerUrl, String title, String languageAtSegmentation, long revisionAtSegmentation, float priority) { + public CreateNonSegmentedPage(String consumerUrl, String title, String languageAtSegmentation, long revisionAtSegmentation, LocalDateTime timestampDontFlushUntil, float priority) { this.consumerUrl = consumerUrl; this.title = title; this.languageAtSegmentation = languageAtSegmentation; this.revisionAtSegmentation = revisionAtSegmentation; + this.timestampDontFlushUntil = timestampDontFlushUntil; this.priority = priority; } @@ -40,6 +43,7 @@ public Page executeAndQuery(Root root, Date date) throws Exception { page.setTitle(title); page.setRevisionAtSegmentation(revisionAtSegmentation); page.setLanguageAtSegmentation(root.getInternedLanguages().intern(languageAtSegmentation)); + page.setTimestampDontFlushUntil(timestampDontFlushUntil); page.setPriority(priority); wiki.getPagesByTitle().put(title, page); return page; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateOrGetPage.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateOrGetPage.java new file mode 100644 index 0000000..263bb1e --- /dev/null +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateOrGetPage.java @@ -0,0 +1,38 @@ +package se.wikimedia.wikispeech.prerender.service.prevalence.transaction; + +import lombok.Data; +import org.prevayler.TransactionWithQuery; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; + +import java.util.Date; + +@Data +public class CreateOrGetPage implements 
TransactionWithQuery { + + private static final long serialVersionUID = 1L; + + private String consumerUrl; + private String title; + + public CreateOrGetPage() { + } + + public CreateOrGetPage(String consumerUrl, String title) { + this.consumerUrl = consumerUrl; + this.title = title; + } + + @Override + public Page executeAndQuery(Root root, Date date) throws Exception { + Wiki wiki = root.getWikiByConsumerUrl().get(consumerUrl); + Page page = wiki.getPagesByTitle().get(title); + if (page != null) + return page; + page = new Page(); + page.setTitle(title); + wiki.getPagesByTitle().put(title, page); + return page; + } +} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateOrUpdatePageSegmentVoice.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateOrUpdatePageSegmentVoice.java index 7c0ca84..863667e 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateOrUpdatePageSegmentVoice.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/CreateOrUpdatePageSegmentVoice.java @@ -25,7 +25,13 @@ public class CreateOrUpdatePageSegmentVoice implements TransactionWithQuery>> { + + private static final long serialVersionUID = 1L; + + private Duration pageMustBeThisOldToBeConsideredForFlushing; + private Duration flushPageAfterThisMuchTimeSinceSegmentation; + + public FindPagesToBeFlushed() { + } + + public FindPagesToBeFlushed( + Duration pageMustBeThisOldToBeConsideredForFlushing, + Duration flushPageAfterThisMuchTimeSinceSegmentation + ) { + this.pageMustBeThisOldToBeConsideredForFlushing = pageMustBeThisOldToBeConsideredForFlushing; + this.flushPageAfterThisMuchTimeSinceSegmentation = flushPageAfterThisMuchTimeSinceSegmentation; + } + + @Override + public Map> query(Root root, Date executionTime) throws Exception { + + LocalDateTime now = Prevalence.toLocalDateTime(executionTime); + + Map> 
pageTitlesByWikiConsumerUrl = new HashMap<>(); + + LocalDateTime ignoreIfPageWasSegmentedAfter = now.minus(pageMustBeThisOldToBeConsideredForFlushing); /* both thresholds are relative to the execution time handed in by Prevayler, the same clock the dont-flush check below uses */ + LocalDateTime flushIfPageWasSegmentedBefore = now.minus(flushPageAfterThisMuchTimeSinceSegmentation); + + for (Wiki wiki : root.getWikiByConsumerUrl().values()) { + Set pageTitlesToBeFlushed = new HashSet<>(); + for (Page page : wiki.getPagesByTitle().values()) { + if (page.getTimestampSegmented() == null) { + // page has not been segmented + } else if (page.getTimestampDontFlushUntil() != null && page.getTimestampDontFlushUntil().isAfter(now)) { + // page is marked as not being flushed + } else if (page.getTimestampSegmented().isAfter(ignoreIfPageWasSegmentedAfter)) { + // page is too young. + // todo: the idea is to avoid looping back to re-segmenting the same page over and over, + // todo: especially if it's the main page or linked from main page. + // todo: there must be a better way to flush out other pages without touching the above mentioned. + } else if (page.getTimestampSegmented().isBefore(flushIfPageWasSegmentedBefore)) { + // segmentation is too old + pageTitlesToBeFlushed.add(page.getTitle()); + } else { + // check if all segments have been synthesized for all voices in wiki for the language of the page.
+ boolean allSegmentsHaveBeenSynthesizedForAllVoices = true; + for (PageSegment segment : page.getSegments()) { + Set voicesExpected = new HashSet<>(wiki.getVoicesPerLanguage().get(page.getLanguageAtSegmentation())); + if (segment.getSynthesizedVoices() != null) { + for (PageSegmentVoice voice : segment.getSynthesizedVoices()) { + if (voice.getTimestampSynthesized() != null) { + voicesExpected.remove(voice.getVoice()); + } + } + } + if (!voicesExpected.isEmpty()) { + allSegmentsHaveBeenSynthesizedForAllVoices = false; + break; + } + } + if (allSegmentsHaveBeenSynthesizedForAllVoices) { + pageTitlesToBeFlushed.add(page.getTitle()); + } + } + } + if (!pageTitlesToBeFlushed.isEmpty()) { + pageTitlesByWikiConsumerUrl.put(wiki.getConsumerUrl(), pageTitlesToBeFlushed); + } + } + return pageTitlesByWikiConsumerUrl; + } +} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPageSegment.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPageSegment.java new file mode 100644 index 0000000..efc4cd7 --- /dev/null +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPageSegment.java @@ -0,0 +1,46 @@ +package se.wikimedia.wikispeech.prerender.service.prevalence.transaction; + +import lombok.Data; +import org.prevayler.Transaction; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegment; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; + +import java.util.Arrays; +import java.util.Date; +import java.util.Iterator; + +@Data +public class FlushPageSegment implements Transaction { + + private static final long serialVersionUID = 1L; + + private String wikiConsumerUrl; + private String pageTitle; + private byte[] segmentHash; + + public 
FlushPageSegment() { + } + + public FlushPageSegment(String wikiConsumerUrl, String pageTitle, byte[] segmentHash) { + this.wikiConsumerUrl = wikiConsumerUrl; + this.pageTitle = pageTitle; + this.segmentHash = segmentHash; + } + + @Override + public void executeOn(Root root, Date executionTime) { + Wiki wiki = root.getWikiByConsumerUrl().get(wikiConsumerUrl); + Page page = wiki.getPagesByTitle().get(pageTitle); + + for (Iterator pageSegments = page.getSegments().iterator(); pageSegments.hasNext(); ) { + PageSegment pageSegment = pageSegments.next(); + if (Arrays.equals(segmentHash, pageSegment.getHash())) { + pageSegments.remove(); + break; + } + } + } + +} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushSegmentPageVoice.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPageSegmentVoice.java similarity index 91% rename from src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushSegmentPageVoice.java rename to src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPageSegmentVoice.java index 152ba1d..e17c7a0 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushSegmentPageVoice.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPageSegmentVoice.java @@ -12,7 +12,7 @@ import java.util.Iterator; @Data -public class FlushSegmentPageVoice implements Transaction { +public class FlushPageSegmentVoice implements Transaction { private static final long serialVersionUID = 1L; @@ -21,10 +21,10 @@ public class FlushSegmentPageVoice implements Transaction { private byte[] segmentHash; private String voice; - public FlushSegmentPageVoice() { + public FlushPageSegmentVoice() { } - public FlushSegmentPageVoice(String wikiConsumerUrl, String pageTitle, byte[] segmentHash, String voice) { + public FlushPageSegmentVoice(String wikiConsumerUrl, String 
pageTitle, byte[] segmentHash, String voice) { this.wikiConsumerUrl = wikiConsumerUrl; this.pageTitle = pageTitle; this.segmentHash = segmentHash; diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPages.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPages.java index 7339892..eac340f 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPages.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/FlushPages.java @@ -3,57 +3,30 @@ import lombok.Data; import org.prevayler.Transaction; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; -import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; -import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegment; -import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.PageSegmentVoice; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; -import java.time.LocalDateTime; -import java.util.Date; -import java.util.Iterator; +import java.util.*; @Data public class FlushPages implements Transaction { private static final long serialVersionUID = 1L; - private LocalDateTime earliestAllowedTimestampSegmented; + private Map> pageTitlesByWikiConsumerUrl = new HashMap<>(); public FlushPages() { } - public FlushPages(LocalDateTime earliestAllowedTimestampSegmented) { - this.earliestAllowedTimestampSegmented = earliestAllowedTimestampSegmented; + public FlushPages(Map> pageTitlesByWikiConsumerUrl) { + this.pageTitlesByWikiConsumerUrl = pageTitlesByWikiConsumerUrl; } @Override public void executeOn(Root root, Date executionTime) { - for (Wiki wiki : root.getWikiByConsumerUrl().values()) { - for (Iterator pages = wiki.getPagesByTitle().values().iterator(); pages.hasNext(); ) { - Page page = pages.next(); - if (page.getTimestampSegmented() != null - && 
page.getTimestampSegmented().isBefore(earliestAllowedTimestampSegmented)) { - - boolean pageShouldBeRemoved = true; - if (page.getSegments() != null && !page.getSegments().isEmpty()) { - for (PageSegment segment : page.getSegments()) { - if (segment.getSynthesizedVoices() != null && !segment.getSynthesizedVoices().isEmpty()) { - for (PageSegmentVoice voice : segment.getSynthesizedVoices()) { - if (voice.getFailedAttempts() == null || voice.getFailedAttempts().isEmpty()) { - pageShouldBeRemoved = false; - break; - } - } - } - if (!pageShouldBeRemoved) break; - } - } - - if (pageShouldBeRemoved) - pages.remove(); - - } - + for (Map.Entry> pageTitlesByWikiConsumerUrl : this.pageTitlesByWikiConsumerUrl.entrySet()) { + Wiki wiki = root.getWikiByConsumerUrl().get(pageTitlesByWikiConsumerUrl.getKey()); + for (String pageTitle : pageTitlesByWikiConsumerUrl.getValue()) { + wiki.getPagesByTitle().remove(pageTitle); } } } diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/SetPageTimestampDontFlushUntil.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/SetPageTimestampDontFlushUntil.java new file mode 100644 index 0000000..48d288d --- /dev/null +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/SetPageTimestampDontFlushUntil.java @@ -0,0 +1,37 @@ +package se.wikimedia.wikispeech.prerender.service.prevalence.transaction; + +import lombok.Data; +import org.prevayler.Transaction; +import org.prevayler.TransactionWithQuery; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; + +import java.time.LocalDateTime; +import java.util.Date; + +@Data +public class SetPageTimestampDontFlushUntil implements Transaction { + + private static final long serialVersionUID = 1L; + + private String consumerUrl; 
+ private String title; + private LocalDateTime timestampDontFlushUntil; + + public SetPageTimestampDontFlushUntil() { + } + + public SetPageTimestampDontFlushUntil(String consumerUrl, String title, LocalDateTime timestampDontFlushUntil) { + this.consumerUrl = consumerUrl; + this.title = title; + this.timestampDontFlushUntil = timestampDontFlushUntil; + } + + @Override + public void executeOn(Root root, Date executionTime) { + Wiki wiki = root.getWikiByConsumerUrl().get(consumerUrl); + Page page = wiki.getPagesByTitle().get(title); + page.setTimestampDontFlushUntil(timestampDontFlushUntil); + } +} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/SetWikiMainPage.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/SetWikiMainPage.java index 6866b11..e2757b3 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/SetWikiMainPage.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/transaction/SetWikiMainPage.java @@ -3,8 +3,12 @@ import lombok.Data; import org.prevayler.Transaction; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; +import java.time.Instant; +import java.time.LocalDateTime; +import java.time.ZoneOffset; import java.util.Date; @Data @@ -23,7 +27,9 @@ public SetWikiMainPage(String consumerUrl, String title) { @Override public void executeOn(Root root, Date date) { Wiki wiki = root.getWikiByConsumerUrl().get(consumerUrl); - wiki.setMainPage(wiki.getPagesByTitle().get(title)); + Page mainPage = wiki.getPagesByTitle().get(title); + mainPage.setTimestampDontFlushUntil(LocalDateTime.ofInstant(Instant.ofEpochMilli(Long.MAX_VALUE), ZoneOffset.UTC)); + wiki.setMainPage(mainPage); } } From 22d2bdcb17af6d858b41d59ac6462a94194f6bf1 Mon Sep 17 
00:00:00 2001 From: kalle Date: Tue, 28 Mar 2023 11:17:19 +0200 Subject: [PATCH 5/8] Properties file --- README.md | 1 - .../prerender/WebAppConfiguration.java | 5 +- .../prerender/mediawiki/WikispeechApi.java | 15 +++-- .../prerender/rest/MainController.java | 12 ++++ .../service/MainPageLinksPrioritizer.java | 26 +++++--- .../prerender/service/Settings.java | 59 +++++++++++++++++++ .../prerender/service/SynthesizeService.java | 27 ++++++--- .../service/prevalence/Prevalence.java | 5 ++ src/main/resources/prerender.properties | 29 +++++++++ 9 files changed, 156 insertions(+), 23 deletions(-) create mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/Settings.java create mode 100644 src/main/resources/prerender.properties diff --git a/README.md b/README.md index 253eb54..7beb3d1 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,6 @@ Will be a candidate to be retried every n hours, where n=number of previous fail * Make all hard coded value mentioned above configurable in a properties file or something. * Report state of candidate, flushing, etc to influxdb. - ## REST Most REST calls are exist for debug and development reasons. 
diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/WebAppConfiguration.java b/src/main/java/se/wikimedia/wikispeech/prerender/WebAppConfiguration.java index 616faf0..fdcbc9e 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/WebAppConfiguration.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/WebAppConfiguration.java @@ -14,6 +14,7 @@ import org.springframework.scheduling.annotation.EnableScheduling; import org.springframework.scheduling.annotation.SchedulingConfigurer; import org.springframework.scheduling.config.ScheduledTaskRegistrar; +import se.wikimedia.wikispeech.prerender.service.Settings; import java.io.IOException; import java.util.concurrent.Executor; @@ -27,7 +28,7 @@ public class WebAppConfiguration implements SchedulingConfigurer { @Bean - public OkHttpClient okHttpClient() { + public OkHttpClient okHttpClient(Settings settings) { return new OkHttpClient.Builder() .readTimeout(5, TimeUnit.MINUTES) .addInterceptor( @@ -39,7 +40,7 @@ public Response intercept(@NotNull Chain chain) throws IOException { Request requestWithUserAgent = originalRequest .newBuilder() .header("Content-Type", "application/json") - .header("User-Agent", "WMSE Wikispeech API Java client") + .header("User-Agent", settings.getString("WebAppConfiguration.userAgent", "WMSE Wikispeech Prerender")) .build(); return chain.proceed(requestWithUserAgent); diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java b/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java index da00e1b..bdcba16 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikispeechApi.java @@ -14,6 +14,7 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Component; import se.wikimedia.wikispeech.prerender.Collector; +import se.wikimedia.wikispeech.prerender.service.Settings; import 
java.io.ByteArrayOutputStream; import java.io.IOException; @@ -26,20 +27,26 @@ public class WikispeechApi { private final Logger log = LogManager.getLogger(getClass()); - private boolean skipJournalMetrics = true; - private String wikispeechBaseUrl = "https://wikispeech.wikimedia.se/w"; + private final boolean skipJournalMetrics; + private final String wikispeechBaseUrl; private final ObjectMapper objectMapper; private final OkHttpClient client; private final PageApi pageApi; + @Autowired public WikispeechApi( - @Autowired PageApi pageApi, - @Autowired OkHttpClient client + Settings settings, + PageApi pageApi, + OkHttpClient client ) { this.pageApi = pageApi; this.client = client; + + this.wikispeechBaseUrl = settings.getString("WikispeechApi.wikispeechBaseUrl", "https://wikispeech.wikimedia.se/w"); + this.skipJournalMetrics = settings.getBoolean("WikispeechApi.skipJournalMetrics", true); + objectMapper = new ObjectMapper() .registerModule(new JavaTimeModule()) .configure(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS, false); diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java b/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java index 30c58aa..b2fdead 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java @@ -263,4 +263,16 @@ public Page query(Root root, Date date) throws Exception { } })); } + + @RequestMapping( + method = RequestMethod.GET, + path = "prevalence/snapshot", + produces = "application/json" + ) + public ResponseEntity snapshot() throws Exception { + prevalence.takeSnapshot(); + Runtime.getRuntime().gc(); + Runtime.getRuntime().gc(); + return ResponseEntity.ok().build(); + } } \ No newline at end of file diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java 
b/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java index 5235051..05b265e 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java @@ -37,16 +37,28 @@ public class MainPageLinksPrioritizer { private final SegmentService segmentService; private final PageApi pageApi; + private final float priorityMultiplier; + private final Duration priorityTimeToLive; + private final Duration timestampDontFlushUntil; + + + @Autowired public MainPageLinksPrioritizer( - @Autowired Prevalence prevalence, - @Autowired PriorityService priorityService, - @Autowired SegmentService segmentService, - @Autowired PageApi pageApi + Prevalence prevalence, + PriorityService priorityService, + SegmentService segmentService, + PageApi pageApi, + Settings settings ) { this.prevalence = prevalence; this.priorityService = priorityService; this.segmentService = segmentService; this.pageApi = pageApi; + + priorityMultiplier = settings.getFloat("MainPageLinksPrioritizer.priorityMultiplier", 5f); + priorityTimeToLive = settings.getDuration("MainPageLinksPrioritizer.priorityTimeToLive", Duration.ofDays(1)); + timestampDontFlushUntil = settings.getDuration("MainPageLinksPrioritizer.timestampDontFlushUntil", Duration.ofDays(5)); + } private final Map lastChangedByWikiConsumerUrl = new HashMap<>(); @@ -56,8 +68,8 @@ public MainPageLinksPrioritizer( @Scheduled(fixedDelay = 1, timeUnit = TimeUnit.MINUTES, initialDelay = 0) public void run() throws Exception { LocalDateTime now = LocalDateTime.now(); - LocalDateTime future = now.plus(Duration.ofDays(1)); - LocalDateTime timestampDontFlushUntil = now.plusDays(5); + LocalDateTime priorityTimeToLive = now.plus(this.priorityTimeToLive); + LocalDateTime timestampDontFlushUntil = now.plus(this.timestampDontFlushUntil); for (Wiki wiki : prevalence.execute(new Query>() { @Override @@ -98,7 +110,7 @@ 
public boolean collect(String title) { priorityService.put( new ConsumerUrlAndTitle(wiki.getConsumerUrl(), title), new PriorityService.PagePrioritySetting( - now, future, 5F, + now, priorityTimeToLive, priorityMultiplier, wiki.getConsumerUrl(), title ) ); diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/Settings.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/Settings.java new file mode 100644 index 0000000..f17ac6b --- /dev/null +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/Settings.java @@ -0,0 +1,59 @@ +package se.wikimedia.wikispeech.prerender.service; + +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.core.io.ResourceLoader; +import org.springframework.stereotype.Component; + +import java.io.IOException; +import java.time.Duration; +import java.util.Properties; + +@Component +public class Settings { + + private final Properties properties; + + @Autowired + public Settings(ResourceLoader resourceLoader) throws IOException { + Properties properties = new Properties(); + properties.load(resourceLoader.getResource("classpath:prerender.properties").getInputStream()); + this.properties = properties; + } + + public String getString(String key, String defaultValue) { + String value = (String)properties.get(key); + if (value == null) return defaultValue; + return value; + } + + public Boolean getBoolean(String key, Boolean defaultValue) { + String value = (String)properties.get(key); + if (value == null) return defaultValue; + return Boolean.parseBoolean(value); + } + + public Integer getInteger(String key, Integer defaultValue) { + String value = (String)properties.get(key); + if (value == null) return defaultValue; + return Integer.parseInt(value); + } + + public Float getFloat(String key, Float defaultValue) { + String value = (String)properties.get(key); + if (value == null) return defaultValue; + return Float.parseFloat(value); + } + + public Double getDouble(String key, 
Double defaultValue) { + String value = (String)properties.get(key); + if (value == null) return defaultValue; + return Double.parseDouble(value); + } + + public Duration getDuration(String key, Duration defaultValue) { + String value = (String)properties.get(key); + if (value == null) return defaultValue; + return Duration.parse(value); + } + +} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java index e1432f8..8f8c9f9 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/SynthesizeService.java @@ -42,20 +42,21 @@ public class SynthesizeService extends AbstractLifecycle implements SmartLifecyc private final Prevalence prevalence; + private final Settings settings; + private final WikispeechApi wikispeechApi; private final PriorityService priorityService; - private final SegmentService segmentService; - /** * If gathered candidates is more than this number, * then cut candidates at end of queue and flush from prevalence. 
*/ - private final int flushCandidatesThreshold = 100000; + private final int flushCandidatesThreshold; + private final Duration pageMustBeThisOldToBeConsideredForFlushing; + private final Duration flushPageAfterThisMuchTimeSinceSegmentation; - private final int numberOfWorkerThreads = 2; - private final int maximumNumberOfCandidates = 250; + private final int numberOfWorkerThreads; private ExecutorService workers; @@ -64,12 +65,17 @@ public SynthesizeService( Prevalence prevalence, WikispeechApi wikispeechApi, PriorityService priorityService, - SegmentService segmentService + Settings settings ) { this.prevalence = prevalence; this.wikispeechApi = wikispeechApi; this.priorityService = priorityService; - this.segmentService = segmentService; + this.settings = settings; + + flushCandidatesThreshold = settings.getInteger("SynthesizeService.flushCandidatesThreshold", 100000); + pageMustBeThisOldToBeConsideredForFlushing = settings.getDuration("SynthesizeService.pageMustBeThisOldToBeConsideredForFlushing", Duration.ofDays(1)); + flushPageAfterThisMuchTimeSinceSegmentation = settings.getDuration("SynthesizeService.flushPageAfterThisMuchTimeSinceSegmentation", Duration.ofDays(2)); + numberOfWorkerThreads = settings.getInteger("SynthesizeService.numberOfWorkerThreads", 1); workers = new ExecutorService() { @Override @@ -87,6 +93,7 @@ protected void execute() { } } }; + workers.setNumberOfWorkerThreads(numberOfWorkerThreads); } @@ -106,8 +113,10 @@ public void repopulateCandidates() { queue.clear(); try { - log.info("Searching for pages to be flushed... Must be at least one day old. 
All older than 2 days will be flushed."); - Map> pageTitlesByWikiConsumerUrl = prevalence.execute(new FindPagesToBeFlushed(Duration.ofDays(1), Duration.ofDays(2))); + log.info("Searching for pages to be flushed..."); + Map> pageTitlesByWikiConsumerUrl = prevalence.execute( + new FindPagesToBeFlushed(pageMustBeThisOldToBeConsideredForFlushing, flushPageAfterThisMuchTimeSinceSegmentation) + ); if (!pageTitlesByWikiConsumerUrl.isEmpty()) { prevalence.execute(new FlushPages(pageTitlesByWikiConsumerUrl)); log.info("Flushed pages {}", pageTitlesByWikiConsumerUrl); diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java index 41419ff..1827564 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/Prevalence.java @@ -11,6 +11,7 @@ import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; import java.io.File; +import java.io.IOException; import java.time.Instant; import java.time.LocalDateTime; import java.time.ZoneId; @@ -122,4 +123,8 @@ public void snapshotAndRemoveJournals() throws Exception { } } + public void takeSnapshot() throws Exception { + prevalyer.takeSnapshot(); + } + } diff --git a/src/main/resources/prerender.properties b/src/main/resources/prerender.properties new file mode 100644 index 0000000..926cfbc --- /dev/null +++ b/src/main/resources/prerender.properties @@ -0,0 +1,29 @@ +# Durations are formatted using https://docs.oracle.com/javase/8/docs/api/java/time/Duration.html#parse-java.lang.CharSequence- + +# User-Agent header used by all HTTP requests, i.e. to Wikispeech and remote MediaWikis. 
+WebAppConfiguration.userAgent = WMSE Wikispeech Prerender + +WikispeechApi.wikispeechBaseUrl = https://wikispeech.wikimedia.se/w + +# Set this to false if you want the Wikispeech server to log requests from the prerender. +WikispeechApi.skipJournalMetrics = true + +# Flush out segments in pages if voice candidates for synthesis grow larger than this number. +# As a rule of thumb, 10kB RAM will be consumed per candidate. E.g. 50000 = 500MB, 100000 = 1GB, and so on. +# The voices with the lowest priority will be flushed out. +SynthesizeService.flushCandidatesThreshold = 50000 +# Use this number of parallel threads when synthesizing voices +SynthesizeService.numberOfWorkerThreads = 1 +# Don't flush pages from cache unless they were first seen this long ago (duration) +SynthesizeService.pageMustBeThisOldToBeConsideredForFlushing = P1D +# Consider flushing pages from cache after this much time (duration) since first seen, even if not all segment voices have been synthesized. +SynthesizeService.flushPageAfterThisMuchTimeSinceSegmentation = P2D + +# Multiply priority of all voices in all segments of pages linked from main page by this much +MainPageLinksPrioritizer.priorityMultiplier = 5 +# Keep priority multiplier for pages for this long (duration) after they were last seen on the main page +MainPageLinksPrioritizer.priorityTimeToLive = P1D +# Keep segments and voices for pages linked from main page in cache for this long (duration) before considering them for flushing.
+MainPageLinksPrioritizer.timestampDontFlushUntil = P5D + + From d3a85c23b427c275bdd71de60a7df1dbd1688891 Mon Sep 17 00:00:00 2001 From: kalle Date: Tue, 28 Mar 2023 12:22:18 +0200 Subject: [PATCH 6/8] Initial --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..23c1459 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.idea +prevalence +target \ No newline at end of file From 27d2bf4280b033e656969f8b11bd7aff9e885d1c Mon Sep 17 00:00:00 2001 From: kalle Date: Wed, 29 Mar 2023 10:25:12 +0200 Subject: [PATCH 7/8] Poll metrics about most visited articles yesterday --- .../prerender/mediawiki/PageApi.java | 10 +- .../prerender/mediawiki/PageUtil.java | 10 ++ .../mediawiki/WikipediaMetricsApi.java | 91 ++++++++++ .../prerender/rest/MainController.java | 30 +++- .../service/MainPageLinksPrioritizer.java | 5 +- .../prerender/service/PriorityService.java | 30 ++-- ...YesterdaysMostReadArticlesPrioritizer.java | 155 ++++++++++++++++++ .../service/prevalence/domain/state/Wiki.java | 2 + src/main/resources/prerender.properties | 5 + 9 files changed, 310 insertions(+), 28 deletions(-) create mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/PageUtil.java create mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikipediaMetricsApi.java create mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/YesterdaysMostReadArticlesPrioritizer.java diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/PageApi.java b/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/PageApi.java index 4b15a7c..579f099 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/PageApi.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/PageApi.java @@ -18,6 +18,13 @@ @Component public class PageApi { + public static void main(String[] args) throws Exception { + PageApi api = new PageApi(new 
OkHttpClient()); + PageInfo mainpage = api.getPageInfo("https://sv.wikipedia.org/w", "Portal:Huvudsida"); + PageInfo quisling = api.getPageInfo("https://sv.wikipedia.org/w", "Vidkun_Quisling"); + System.currentTimeMillis(); + } + private final Logger log = LogManager.getLogger(getClass()); private final OkHttpClient client; @@ -74,8 +81,9 @@ public okhttp3.Headers getHttpHeaders(String consumerUrl, String title) throws I return response.headers(); } + @Autowired public PageApi( - @Autowired OkHttpClient client + OkHttpClient client ) { this.client = client; objectMapper = new ObjectMapper() diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/PageUtil.java b/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/PageUtil.java new file mode 100644 index 0000000..8f4d982 --- /dev/null +++ b/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/PageUtil.java @@ -0,0 +1,10 @@ +package se.wikimedia.wikispeech.prerender.mediawiki; + +public class PageUtil { + + public static String normalizeTitle(String title) { + title = title.replaceAll("_", " "); + return title; + } + +} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikipediaMetricsApi.java b/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikipediaMetricsApi.java new file mode 100644 index 0000000..66620a5 --- /dev/null +++ b/src/main/java/se/wikimedia/wikispeech/prerender/mediawiki/WikipediaMetricsApi.java @@ -0,0 +1,91 @@ +package se.wikimedia.wikispeech.prerender.mediawiki; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.SerializationFeature; +import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; +import lombok.Data; +import okhttp3.*; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Component; + +import java.io.IOException; +import java.time.LocalDate; +import java.time.format.DateTimeFormatter; 
+import java.util.List; + +@Component +public class WikipediaMetricsApi { + + public static void main(String[] args) throws Exception { + WikipediaMetricsApi api = new WikipediaMetricsApi(new OkHttpClient()); + WikipediaMetricsApi.PageViews pageViews = api.getPageViewsTop("sv.wikipedia", LocalDate.parse("2023-03-14")); + System.currentTimeMillis(); + + } + + private final OkHttpClient client; + private final ObjectMapper objectMapper; + + @Autowired + public WikipediaMetricsApi( + OkHttpClient client + ) { + this.client = client; + objectMapper = new ObjectMapper() + .registerModule(new JavaTimeModule()) + .configure(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS, false); + + } + + private static final DateTimeFormatter pathSuffixDateFormatter = DateTimeFormatter.ofPattern("yyyy/MM/dd"); + + public PageViews getPageViewsTop(String wiki, LocalDate date) throws IOException { + // /sv.wikipedia/all-access/2023/03/27 + HttpUrl.Builder urlBuilder = HttpUrl.parse("https://wikimedia.org/api/rest_v1/metrics/pageviews/top/" + wiki + "/all-access/" + date.format(pathSuffixDateFormatter)).newBuilder(); + + Request request = new Request.Builder() + .url(urlBuilder.build()) + .build(); + + Call call = client.newCall(request); + Response response = call.execute(); + + JsonNode json; + try { + + if (response.code() == 404) + // occurs if there is no data for this date, e.g. 
a future date seen from the tz of the remote server + return null; + + if (response.code() != 200) { + throw new IOException("Response" + response); + } + + json = objectMapper.readTree(response.body().byteStream()); + } finally { + response.close(); + } + + return objectMapper.convertValue(json.get("items").get(0), PageViews.class); + } + + @Data + public static class PageViews { + private String project; + private String access; + private String year; + private String month; + private String day; + private List articles; + } + + @Data + public static class PageViewArticle { + private String article; + private int views; + private int rank; + + } + +} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java b/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java index b2fdead..384eee6 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/rest/MainController.java @@ -17,10 +17,7 @@ import se.wikimedia.wikispeech.prerender.LocalCache; import se.wikimedia.wikispeech.prerender.mediawiki.PageApi; import se.wikimedia.wikispeech.prerender.mediawiki.WikispeechApi; -import se.wikimedia.wikispeech.prerender.service.CreateWikiCommand; -import se.wikimedia.wikispeech.prerender.service.PriorityService; -import se.wikimedia.wikispeech.prerender.service.SegmentService; -import se.wikimedia.wikispeech.prerender.service.SynthesizeService; +import se.wikimedia.wikispeech.prerender.service.*; import se.wikimedia.wikispeech.prerender.service.prevalence.Prevalence; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; @@ -44,20 +41,25 @@ public class MainController { private final PageApi pageApi; private final SegmentService segmentService; private final SynthesizeService synthesizeService; + private final YesterdaysMostReadArticlesPrioritizer 
yesterdaysMostReadPrioritizer; private final ObjectMapper objectMapper; + @Autowired public MainController( - @Autowired Prevalence prevalence, - @Autowired SegmentService segmentService, - @Autowired PageApi pageApi, - @Autowired WikispeechApi wikispeechApi, - @Autowired SynthesizeService synthesizeService + Prevalence prevalence, + SegmentService segmentService, + PageApi pageApi, + WikispeechApi wikispeechApi, + SynthesizeService synthesizeService, + YesterdaysMostReadArticlesPrioritizer yesterdaysMostReadPrioritizer ) { this.prevalence = prevalence; this.pageApi = pageApi; this.wikispeechApi = wikispeechApi; this.segmentService = segmentService; this.synthesizeService = synthesizeService; + this.yesterdaysMostReadPrioritizer = yesterdaysMostReadPrioritizer; + objectMapper = new ObjectMapper() .registerModule(new JavaTimeModule()) .configure(SerializationFeature.WRITE_DATES_AS_TIMESTAMPS, false); @@ -275,4 +277,14 @@ public ResponseEntity snapshot() throws Exception { Runtime.getRuntime().gc(); return ResponseEntity.ok().build(); } + + @RequestMapping( + method = RequestMethod.GET, + path = "prioritizeYesterdaysMetrics", + produces = "application/json" + ) + public ResponseEntity prioritizeYesterdaysMetrics() throws Exception { + yesterdaysMostReadPrioritizer.process(); + return ResponseEntity.ok().build(); + } } \ No newline at end of file diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java index 05b265e..e2dbe1c 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/MainPageLinksPrioritizer.java @@ -27,7 +27,6 @@ import java.util.concurrent.TimeUnit; @Service -@EnableScheduling public class MainPageLinksPrioritizer { private final Logger log = LogManager.getLogger(getClass()); @@ -37,7 +36,7 @@ public class 
MainPageLinksPrioritizer { private final SegmentService segmentService; private final PageApi pageApi; - private final float priorityMultiplier; + private final double priorityMultiplier; private final Duration priorityTimeToLive; private final Duration timestampDontFlushUntil; @@ -55,7 +54,7 @@ public MainPageLinksPrioritizer( this.segmentService = segmentService; this.pageApi = pageApi; - priorityMultiplier = settings.getFloat("MainPageLinksPrioritizer.priorityMultiplier", 5f); + priorityMultiplier = settings.getDouble("MainPageLinksPrioritizer.priorityMultiplier", 5d); priorityTimeToLive = settings.getDuration("MainPageLinksPrioritizer.priorityTimeToLive", Duration.ofDays(1)); timestampDontFlushUntil = settings.getDuration("MainPageLinksPrioritizer.timestampDontFlushUntil", Duration.ofDays(5)); diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/PriorityService.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/PriorityService.java index 292b686..c13c1b4 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/PriorityService.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/PriorityService.java @@ -101,7 +101,7 @@ public static class PagePrioritySetting extends PrioritySetting { public PagePrioritySetting() { } - public PagePrioritySetting(LocalDateTime from, LocalDateTime to, float multiplier, String consumerUrl, String title) { + public PagePrioritySetting(LocalDateTime from, LocalDateTime to, double multiplier, String consumerUrl, String title) { super(from, to, multiplier); this.consumerUrl = consumerUrl; this.title = title; @@ -140,20 +140,6 @@ public CalculatedPriority calculatePriority(SynthesizeService.CandidateToBeSynth priority.getExplanations().add(new Explanation(1D, "Starting value")); } - value *= candidateToBeSynthesized.getPage().getPriority(); - if (explain) { - priority.getExplanations().add(new Explanation(value, "multiplied with page priority " + 
candidateToBeSynthesized.getPage().getPriority())); - } - - if (candidateToBeSynthesized.getPageSegmentVoice() != null - && candidateToBeSynthesized.getPageSegmentVoice().getFailedAttempts() != null - && !candidateToBeSynthesized.getPageSegmentVoice().getFailedAttempts().isEmpty()) { - value /= candidateToBeSynthesized.getPageSegmentVoice().getFailedAttempts().size() + 1D; - if (explain) { - priority.getExplanations().add(new Explanation(value, "Divided with number of failures " + candidateToBeSynthesized.getPageSegmentVoice().getFailedAttempts().size())); - } - } - // lower segment index in page is slightly more prioritized value += 1D - Math.min(1000, candidateToBeSynthesized.getPageSegment().getLowestIndexAtSegmentation()) / 1000D; if (explain) { @@ -168,6 +154,20 @@ public CalculatedPriority calculatePriority(SynthesizeService.CandidateToBeSynth } } + value *= candidateToBeSynthesized.getPage().getPriority(); + if (explain) { + priority.getExplanations().add(new Explanation(value, "Multiplied with page priority " + candidateToBeSynthesized.getPage().getPriority())); + } + + if (candidateToBeSynthesized.getPageSegmentVoice() != null + && candidateToBeSynthesized.getPageSegmentVoice().getFailedAttempts() != null + && !candidateToBeSynthesized.getPageSegmentVoice().getFailedAttempts().isEmpty()) { + value /= candidateToBeSynthesized.getPageSegmentVoice().getFailedAttempts().size() + 1D; + if (explain) { + priority.getExplanations().add(new Explanation(value, "Divided with number of failures " + candidateToBeSynthesized.getPageSegmentVoice().getFailedAttempts().size())); + } + } + PrioritySetting multiplier = getMultiplier( candidateToBeSynthesized.getWiki(), candidateToBeSynthesized.getPage(), diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/YesterdaysMostReadArticlesPrioritizer.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/YesterdaysMostReadArticlesPrioritizer.java new file mode 100644 index 0000000..5cb6279 --- /dev/null +++ 
b/src/main/java/se/wikimedia/wikispeech/prerender/service/YesterdaysMostReadArticlesPrioritizer.java @@ -0,0 +1,155 @@ +package se.wikimedia.wikispeech.prerender.service; + +import lombok.Data; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.scheduling.annotation.Scheduled; +import org.springframework.stereotype.Component; +import se.wikimedia.wikispeech.prerender.mediawiki.PageApi; +import se.wikimedia.wikispeech.prerender.mediawiki.PageUtil; +import se.wikimedia.wikispeech.prerender.mediawiki.WikipediaMetricsApi; +import se.wikimedia.wikispeech.prerender.service.prevalence.Prevalence; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Page; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; +import se.wikimedia.wikispeech.prerender.service.prevalence.query.GetPage; +import se.wikimedia.wikispeech.prerender.service.prevalence.query.GetWikis; +import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.CreateNonSegmentedPage; +import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.CreateOrGetPage; +import se.wikimedia.wikispeech.prerender.service.prevalence.transaction.SetPageTimestampDontFlushUntil; + +import java.io.IOException; +import java.net.URL; +import java.time.Duration; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.TimeUnit; + +@Component +public class YesterdaysMostReadArticlesPrioritizer { + + private final Logger log = LogManager.getLogger(getClass()); + + private final Prevalence prevalence; + private final PriorityService priorityService; + private final SegmentService segmentService; + private final WikipediaMetricsApi wikipediaMetricsApi; + private final Settings settings; + private final PageApi pageApi; + + + @Autowired + public 
YesterdaysMostReadArticlesPrioritizer( + Settings settings, + Prevalence prevalence, + PriorityService priorityService, + SegmentService segmentService, + WikipediaMetricsApi wikipediaMetricsApi, + PageApi pageApi + ) { + this.settings = settings; + this.prevalence = prevalence; + this.priorityService = priorityService; + this.segmentService = segmentService; + this.wikipediaMetricsApi = wikipediaMetricsApi; + this.pageApi = pageApi; + } + + @Scheduled(fixedDelay = 30, initialDelay = 0, timeUnit = TimeUnit.MINUTES) + public void process() throws Exception { + // todo should be yesterday seen from the tz of the remote server + // todo check response header date: Tue, 28 Mar 2023 22:33:23 GMT + process(LocalDate.now().minusDays(1), + settings.getDouble("YesterdaysMostReadArticlesPrioritizer.topRankMultiplier", 4d), + settings.getDuration("YesterdaysMostReadArticlesPrioritizer.priorityTimeToLive", Duration.ofDays(1)), + settings.getDuration("YesterdaysMostReadArticlesPrioritizer.dontFlushPageUntil", Duration.ofDays(3)), + settings.getInteger("YesterdaysMostReadArticlesPrioritizer.maximumArticles", 200) + ); + } + + private final Map mostRecentlyProcessedMetricsPerWiki = new HashMap<>(); + + public void process( + LocalDate date, + double topRankMultiplier, + Duration priorityTimeToLive, + Duration dontFlushPageUntil, + int maximumArticles + ) throws Exception { + for (Wiki wiki : prevalence.execute(new GetWikis())) { + WikipediaMetricsApi.PageViews pageViews = getPageViews(wiki, date); + WikipediaMetricsApi.PageViews previousProcessedPageViews = mostRecentlyProcessedMetricsPerWiki.get(wiki); + if (pageViews != null && !pageViews.equals(previousProcessedPageViews)) { + process(wiki, pageViews, topRankMultiplier, priorityTimeToLive, dontFlushPageUntil, maximumArticles); + mostRecentlyProcessedMetricsPerWiki.put(wiki, pageViews); + } + } + } + + public void process( + Wiki wiki, + WikipediaMetricsApi.PageViews pageViews, + double topRankMultiplier, + Duration 
priorityTimeToLive, + Duration dontFlushPageUntil, + int maximumArticles + ) throws Exception { + + LocalDateTime now = LocalDateTime.now(); + LocalDateTime timestampPriorityTimeToLive = now.plus(priorityTimeToLive); + LocalDateTime timestampDontFlushPageUntil = now.plus(dontFlushPageUntil); + double multiplier = topRankMultiplier; + double multiplierRankSubtrahend = 1d / (double)maximumArticles; + int articlesCounter = 0; + for (WikipediaMetricsApi.PageViewArticle article : pageViews.getArticles()) { + + // the article name contains "_" instead of " ", etc, so we clean it up this way + // so that it's normalized with data retrieved from elsewhere in this project. + + PageApi.PageInfo pageInfo = pageApi.getPageInfo(wiki.getConsumerUrl(), article.getArticle()); + if (pageInfo == null) + // this is probably a special page. Search, recent changes, etc. + continue; + + String title = pageInfo.getTitle(); + + Page page = prevalence.execute(new GetPage(wiki.getConsumerUrl(), title)); + if (page == null) + page = prevalence.execute(new CreateOrGetPage(wiki.getConsumerUrl(), title)); + + if (page.getTimestampDontFlushUntil() == null || page.getTimestampDontFlushUntil().isBefore(timestampDontFlushPageUntil)) + prevalence.execute(new SetPageTimestampDontFlushUntil(wiki.getConsumerUrl(), title, timestampDontFlushPageUntil)); + + segmentService.queue(wiki.getConsumerUrl(), title); + + priorityService.put( + new ConsumerUrlAndTitle(wiki.getConsumerUrl(), title), + new PriorityService.PagePrioritySetting(now, timestampPriorityTimeToLive, multiplier, wiki.getConsumerUrl(), title)); + + if (articlesCounter++ >= maximumArticles) break; + multiplier -= multiplierRankSubtrahend; + } + } + + private WikipediaMetricsApi.PageViews getPageViews(Wiki wiki, LocalDate date) throws IOException { + String host = new URL(wiki.getConsumerUrl()).getHost(); + if (!host.endsWith(".wikipedia.org")) throw new UnsupportedOperationException("Wiki with consumerUrl " + wiki.getConsumerUrl() + "is not 
supported by Wikipedia metrics."); + String metricsWikiName = host.replaceAll("^([a-z]+\\.wikipedia)\\.org$", "$1"); + return wikipediaMetricsApi.getPageViewsTop(metricsWikiName, date); + } + + @Data + private static class ConsumerUrlAndTitle { + private String consumerUrl; + private String title; + + public ConsumerUrlAndTitle(String consumerUrl, String title) { + this.consumerUrl = consumerUrl; + this.title = title; + } + } + + +} diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/Wiki.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/Wiki.java index 7a05758..0eccd28 100644 --- a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/Wiki.java +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/domain/state/Wiki.java @@ -13,6 +13,8 @@ public class Wiki implements Serializable { private static final long serialVersionUID = 1L; private String name; + + /** eg https://sv.wikipedia.org/w */ private String consumerUrl; private Page mainPage; diff --git a/src/main/resources/prerender.properties b/src/main/resources/prerender.properties index 926cfbc..2a1226b 100644 --- a/src/main/resources/prerender.properties +++ b/src/main/resources/prerender.properties @@ -26,4 +26,9 @@ MainPageLinksPrioritizer.priorityTimeToLive = P1D # Keep segments and voices for pages linked from main page in cache for this long (duration) before considering them for flushing. MainPageLinksPrioritizer.timestampDontFlushUntil = P5D +YesterdaysMostReadArticlesPrioritizer.topRankMultiplier = 4 +YesterdaysMostReadArticlesPrioritizer.priorityTimeToLive = P1D +YesterdaysMostReadArticlesPrioritizer.dontFlushPageUntil = P3D +# must not be greater than 1000 +YesterdaysMostReadArticlesPrioritizer.maximumArticles = 200 From 4b66131f48b330b6094fd5f98e2a1eb6f2ea0b09 Mon Sep 17 00:00:00 2001 From: kalle Date: Tue, 23 May 2023 12:36:05 +0200 Subject: [PATCH 8/8] gitignored file! 
--- .gitignore | 4 ++-- .../service/prevalence/query/GetWikis.java | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/GetWikis.java diff --git a/.gitignore b/.gitignore index 23c1459..5d3c895 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ .idea -prevalence -target \ No newline at end of file +/prevalence +/target \ No newline at end of file diff --git a/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/GetWikis.java b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/GetWikis.java new file mode 100644 index 0000000..bc65a5a --- /dev/null +++ b/src/main/java/se/wikimedia/wikispeech/prerender/service/prevalence/query/GetWikis.java @@ -0,0 +1,17 @@ +package se.wikimedia.wikispeech.prerender.service.prevalence.query; + +import org.prevayler.Query; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.Root; +import se.wikimedia.wikispeech.prerender.service.prevalence.domain.state.Wiki; + +import java.util.Collection; +import java.util.Date; +import java.util.HashSet; + +public class GetWikis implements Query> { + + @Override + public Collection query(Root root, Date executionTime) throws Exception { + return new HashSet(root.getWikiByConsumerUrl().values()); + } +}