From aebd9c26aea1224c2f292cc6e70dfac52af1265f Mon Sep 17 00:00:00 2001 From: Andrei P Date: Tue, 4 Oct 2022 19:23:12 +0300 Subject: [PATCH] fix itemprop containing articleBody If itemprop was not exactly equal to "articleBody", the node was "cleaned"; for instance, a node with itemprop="description articleBody" would be cleaned. Blogspot / Blogger, for example, uses this itemprop value. --- newspaper/cleaners.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/newspaper/cleaners.py b/newspaper/cleaners.py index 47b6f1a89..610156f31 100644 --- a/newspaper/cleaners.py +++ b/newspaper/cleaners.py @@ -47,7 +47,7 @@ def __init__(self, config): .create("\n", "\n\n")\ .append("\t")\ .append("^\\s+$") - self.contains_article = './/article|.//*[@id="article"]|.//*[@itemprop="articleBody"]' + self.contains_article = './/article|.//*[@id="article"]|.//*[contains(@itemprop,"articleBody")]' def clean(self, doc_to_clean): """Remove chunks of the DOM as specified