-
Notifications
You must be signed in to change notification settings - Fork 81
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #697 from flairNLP/add-se
Add `BE`
- Loading branch information
Showing
7 changed files
with
372 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from fundus.publishers.base_objects import Publisher, PublisherGroup | ||
from fundus.publishers.be.nieuwsblad import NieuwsbladParser | ||
from fundus.scraping.url import RSSFeed, Sitemap | ||
|
||
|
||
class BE(metaclass=PublisherGroup):
    """Publisher group that currently declares a single publisher, Nieuwsblad."""

    Nieuwsblad = Publisher(
        name="Nieuwsblad",
        domain="https://www.nieuwsblad.be/",
        # Each entry is one per-section RSS endpoint; the UUID in the path identifies the section.
        sources=[
            RSSFeed(feed_url)
            for feed_url in (
                "https://www.nieuwsblad.be/rss/section/55178e67-15a8-4ddd-a3d8-bfe5708f8932",
                "https://www.nieuwsblad.be/rss/section/7f1bc231-66e7-49f0-a126-b7346eb3e2fa",
                "https://www.nieuwsblad.be/rss/section/3dfcee99-2971-4c4c-a603-8c41ae86398b",
                "https://www.nieuwsblad.be/rss/section/c0c3b215-10be-4f82-86d6-8b8584a5639d",
            )
        ],
        parser=NieuwsbladParser,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
import datetime | ||
import re | ||
from typing import List, Optional | ||
|
||
from lxml.etree import XPath | ||
|
||
from fundus.parser import ArticleBody, BaseParser, Image, ParserProxy, attribute | ||
from fundus.parser.utility import ( | ||
extract_article_body_with_selector, | ||
generic_author_parsing, | ||
generic_date_parsing, | ||
generic_topic_parsing, | ||
image_extraction, | ||
strip_nodes_to_text, | ||
) | ||
|
||
|
||
class NieuwsbladParser(ParserProxy):
    """Parser proxy for Nieuwsblad articles; it currently holds one parser version, ``V1``."""

    class V1(BaseParser):
        """Reads article attributes from the parsed HTML document and its linked-data metadata."""

        # Element flagged as the article intro; passed on as the summary selector.
        _summary_selector = XPath("//div[@data-testid='article-intro']")
        # Direct <p> children of the article body that have at least one text node.
        _paragraph_selector = XPath("//div[@data-testid='article-body']/p[text()]")
        # Sub-headlines are either bold spans inside body paragraphs or <h3> elements in the body.
        _subheadline_selector = XPath(
            "//div[@data-testid='article-body']/p/span[@class='bold'] | //div[@data-testid='article-body']/h3"
        )

        # List items of any <ul> whose class attribute contains 'taglist'.
        _topic_selector = XPath("//ul[contains(@class, 'taglist')]/li")

        @attribute
        def body(self) -> Optional[ArticleBody]:
            """Return the article body assembled from the summary, sub-headline and paragraph selectors."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the linked-data ``author`` entry."""
            author_entry = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(author_entry)

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the publishing date parsed from the linked-data ``datePublished`` entry."""
            published = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(published)

        @attribute
        def title(self) -> Optional[str]:
            """Return the linked-data ``headline`` entry as the title."""
            return self.precomputed.ld.bf_search("headline")

        @attribute
        def topics(self) -> List[str]:
            """Return the topics found in the tag list, or an empty list when no text is extracted."""
            tag_nodes = self._topic_selector(self.precomputed.doc)
            # Tags are joined on "," so the same character can be used as the split delimiter below.
            joined_tags = strip_nodes_to_text(tag_nodes, join_on=",")
            if joined_tags is None:
                return []
            return generic_topic_parsing(joined_tags, delimiter=",")

        @attribute
        def images(self) -> List[Image]:
            """Return the images extracted from the document."""
            # Credit text follows a "©" sign, optionally preceded by an em dash; the remainder
            # is captured in the named group "credits".
            credit_pattern = re.compile(r"\s*—?\s*©\s*(?P<credits>.*)")
            # NOTE(review): presumably image extraction stops at the partner box widget;
            # confirm against image_extraction.
            lower_boundary = XPath("//div[@class='widget partnerbox_1']")
            return image_extraction(
                doc=self.precomputed.doc,
                paragraph_selector=self._paragraph_selector,
                author_selector=credit_pattern,
                lower_boundary_selector=lower_boundary,
            )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,252 @@ | ||
{ | ||
"V1": { | ||
"authors": [ | ||
"nieuwsblad.be" | ||
], | ||
"body": { | ||
"summary": [ | ||
"Annelies Verlinden (46) maakt voor CD&V opnieuw haar opwachting in de federale regering, dit keer als minister van Justitie. De ex-zakenadvocate werd Vivaldi binnengeloodst door toenmalig partijvoorzitter Joachim Coens en groeide na een goed persoonlijk resultaat op 9 juni 2024 uit tot een sterkhouder voor de christendemocraten." | ||
], | ||
"sections": [ | ||
{ | ||
"headline": [], | ||
"paragraphs": [ | ||
"Annelies Verlinden wordt in 1978 geboren in het Antwerpse Merksem en studeert in 2001 af van de KULeuven met een master in de rechten. Daarna haalt ze nog een diploma Europees recht aan de UCL.", | ||
"Na haar studies gaat ze aan de slag bij het internationale advocatenkantoor DLA Piper. In 2014 wordt ze er vennoot. Verlinden heeft een expertise in administratief recht, milieurecht en publiek recht. Ze treedt onder meer op als advocate in de Arco-rechtszaak bij de Raad van State.", | ||
"In oktober 2020 tovert CD&V-voorzitter Joachim Coens haar uit de hoed als minister van Binnenlandse Zaken en Institutionele Hervormingen en Democratische Vernieuwing. Een grote verrassing in de Wetstraat, al is haar politieke engagement niet nieuw. Verlinden is tussen 2005 en 2009 al ondervoorzitter van Jong CD&V en is in haar thuisbasis Schoten tussen 2003 en 2012 gemeenteraadslid." | ||
] | ||
}, | ||
{ | ||
"headline": [ | ||
"Coronapandemie" | ||
], | ||
"paragraphs": [ | ||
"Begin vorig jaar onthult ze in haar boek Eerlijk gezegd dat Joachim Coens al jaren een belangrijke rol speelt in haar politiek engagement. Ze leerden elkaar kennen toen zij hem als rechtenstudent interviewde voor een paper en hielden sindsdien contact. In 2020 consulteerde Coens haar geregeld tijdens de regeringsonderhandelingen en vroeg hij haar uiteindelijk om minister te worden. De beslissing nam ze naar eigen zeggen “in vijf minuten”.", | ||
"Verlinden wordt de eerste jaren net als de andere Vivaldi-ministers volledig opgeslokt door de coronapandemie. Als minister van Binnenlandse Zaken is ze verantwoordelijk voor de ministeriële besluiten met coronaregels die na afloop van elk Overlegcomité moeten worden opgesteld. Dat werk mondt uiteindelijk uit in een pandemiewet, die de politiek een stevigere juridische basis biedt voor de verregaande maatregelen." | ||
] | ||
}, | ||
{ | ||
"headline": [ | ||
"Overstromingen" | ||
], | ||
"paragraphs": [ | ||
"Ook naast de coronapandemie blijft Verlinden niet gespaard van crisissen. De overstromingen in Wallonië in de zomer van 2021 grijpen haar zwaar aan, net als de moord op de Brusselse agent Thomas Monjoie en de aanslag op Zweedse voetbalsupporters in oktober 2023. (Tekst gaat verder onder de foto.)", | ||
"Vivaldi houdt er gedurende de legislatuur een gespannen relatie met de politievakbonden op na. Verlinden zorgt er wel voor dat agenten voor het eerst in 20 jaar een loonsverhoging bovenop de index krijgen, maar die moet na overleg in de regering over meerdere jaren worden gespreid, tot onvrede van de bonden. Dat er een uitdoofscenario voor de voordelige pensioenregeling voor politieagenten (NAVAP) aan gekoppeld wordt, wordt evenmin op applaus onthaald.", | ||
"Samen met MR-vicepremier David Clarinval is Verlinden bevoegd voor de voorbereiding van de institutionele toekomst van het land, maar de brede volksbevraging die het tweetal lanceert draait uit op een sisser." | ||
] | ||
}, | ||
{ | ||
"headline": [ | ||
"Chatbot" | ||
], | ||
"paragraphs": [ | ||
"Met de pers heeft de CD&V-politica het moeilijk, geeft ze zelf aan in Eerlijk gezegd. Ze wordt afgeschilderd als “ijskoningin of “chatbot”, beelden waarin ze zich helemaal niet herkent. Om die reden gaat ze spaarzaam om met details over haar privéleven en probeert ze beheerst te communiceren, zonder polariserend taalgebruik. “Ik voel geen behoefte om populair te doen”, zegt ze daar zelf over.", | ||
"Die houding levert haar electoraal geen windeieren op. In juni 2024 trekt Verlinden de Kamerlijst voor CD&V in Antwerpen, een moeilijke provincie voor de christendemocraten. Maar de minister haalt er meer dan 65.000 voorkeurstemmen, het op vier na sterkste resultaat nationaal. Haar status in de partij vaart er wel bij, en Verlinden staat voor CD&V op het voorplan tijdens de federale regeringsonderhandelingen. Dat ze net als vicepremier Vincent Van Peteghem in Arizona opnieuw een rol zou spelen, is dus geen grote verrassing." | ||
] | ||
} | ||
] | ||
}, | ||
"images": [ | ||
{ | ||
"versions": [ | ||
{ | ||
"url": "https://img.nieuwsblad.be/RNzXDeKWDb7yW-vhpgeCexODVdI=/120x80/smart/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2F9c36b38b-2b0b-4738-a73c-f9bc53802d8a.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 120, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/e8q79eW964LYHhAx5Olld4b7hj8=/160x107/smart/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2F9c36b38b-2b0b-4738-a73c-f9bc53802d8a.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 160, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/ZiPOlKsdjOaIS9tILwNsjdNfhm4=/320x213/smart/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2F9c36b38b-2b0b-4738-a73c-f9bc53802d8a.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 320, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/SAlfS1wSy7jPzE9ldorNA6_f-7o=/640x427/smart/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2F9c36b38b-2b0b-4738-a73c-f9bc53802d8a.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 640, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/LW6KV9O1fhZ-GpU6a7unxVmih4I=/960x640/smart/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2F9c36b38b-2b0b-4738-a73c-f9bc53802d8a.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 960, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/G3jq-9FCKs1FA03rjBaiffCgA9s=/1280x853/smart/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2F9c36b38b-2b0b-4738-a73c-f9bc53802d8a.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 1280, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
} | ||
], | ||
"is_cover": true, | ||
"description": null, | ||
"caption": null, | ||
"authors": [ | ||
"BART DEWAELE" | ||
], | ||
"position": 1725 | ||
}, | ||
{ | ||
"versions": [ | ||
{ | ||
"url": "https://img.nieuwsblad.be/tQtQyQc2RV3ZmlDThNU4ZGbZV3c=/fit-in/120x80/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2Fea5be229-5670-428f-a8df-6bd2ff23870f.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 120, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/kLiUYeLuCxjkUzdOdClVl7dmQZo=/fit-in/160x107/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2Fea5be229-5670-428f-a8df-6bd2ff23870f.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 160, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/E_sDEyXStA2Ly0BrBM_RbeEF2Uo=/fit-in/320x213/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2Fea5be229-5670-428f-a8df-6bd2ff23870f.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 320, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/boIa2QBP6bmWMSX_B4YzPpstYbs=/fit-in/640x427/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2Fea5be229-5670-428f-a8df-6bd2ff23870f.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 640, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/spZI5HATI1of-_VLGMmTGaEH3so=/fit-in/960x640/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2Fea5be229-5670-428f-a8df-6bd2ff23870f.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 960, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/rrV7t3Jf-3UnbiLfH_eSt-mRYWY=/fit-in/1280x853/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2Fea5be229-5670-428f-a8df-6bd2ff23870f.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 1280, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
} | ||
], | ||
"is_cover": false, | ||
"description": "Verlinden tijdens de coronacrisis.", | ||
"caption": "Verlinden tijdens de coronacrisis.", | ||
"authors": [ | ||
"BELGA" | ||
], | ||
"position": 1769 | ||
}, | ||
{ | ||
"versions": [ | ||
{ | ||
"url": "https://img.nieuwsblad.be/1GHF6TvniEdPP504KVGyPxPuOe4=/fit-in/120x80/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2Fa0b254d3-e314-4b05-a29f-d1f35d50ebf7.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 120, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/lI1xY0yCSMHbXaSWYUFnUAwJFVM=/fit-in/160x107/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2Fa0b254d3-e314-4b05-a29f-d1f35d50ebf7.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 160, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/APU3Kcr7FL5sP9i6rsOTRaqRGlE=/fit-in/320x213/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2Fa0b254d3-e314-4b05-a29f-d1f35d50ebf7.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 320, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/x9bUGCUqFJ1L1eshPXYlTCV2x2w=/fit-in/640x427/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2Fa0b254d3-e314-4b05-a29f-d1f35d50ebf7.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 640, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/u6uoXuNK2Uxx3CAnjLRuoPwxsqM=/fit-in/960x640/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2Fa0b254d3-e314-4b05-a29f-d1f35d50ebf7.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 960, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
}, | ||
{ | ||
"url": "https://img.nieuwsblad.be/EqxuDrxfr_sQ3PofZpLC8YfgKcM=/fit-in/1280x853/https%3A%2F%2Fstatic.nieuwsblad.be%2FAssets%2FImages_Upload%2F2025%2F02%2F02%2Fa0b254d3-e314-4b05-a29f-d1f35d50ebf7.jpg", | ||
"query_width": null, | ||
"size": { | ||
"width": 1280, | ||
"height": 0 | ||
}, | ||
"type": "image/jpeg" | ||
} | ||
], | ||
"is_cover": false, | ||
"description": "Tijdens een herdenking voor de slachtoffers van de waterbom van juli 2021 in de provincie Luik.", | ||
"caption": "Tijdens een herdenking voor de slachtoffers van de waterbom van juli 2021 in de provincie Luik.", | ||
"authors": [ | ||
"BELGA" | ||
], | ||
"position": 1784 | ||
} | ||
], | ||
"publishing_date": "2025-02-02 20:07:46+01:00", | ||
"title": "BIO. Annelies Verlinden ruilt Binnenlandse Zaken in voor Justitie", | ||
"topics": [ | ||
"Annelies Verlinden" | ||
] | ||
} | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{ | ||
"Nieuwsblad_2025_02_02.html.gz": { | ||
"url": "https://www.nieuwsblad.be/cnt/dmf20250202_96887132", | ||
"crawl_date": "2025-02-02 23:02:23.326767" | ||
} | ||
} |