Skip to content

Commit

Permalink
Draft of edited_date
Browse files Browse the repository at this point in the history
  • Loading branch information
jpontoire committed Jan 8, 2025
1 parent d770363 commit 240f1f2
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 7 deletions.
27 changes: 20 additions & 7 deletions minet/reddit/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,12 @@ def get_points(ele):
return scrapped_points


def get_dates(ele):
    """Return the ``(published_date, edited_date)`` pair scraped from *ele*.

    Both values are read from the ``datetime`` attribute via
    ``ele.scrape_one``: the published date from the selector
    ``time[class='']`` and the edited date from
    ``time[class='edited-timestamp']``.

    NOTE(review): what ``scrape_one`` yields when a selector matches nothing
    is not visible here -- presumably a falsy value; confirm against the
    scraping helper before relying on it.
    """
    # Order matters for the returned tuple: published first, edited second.
    selectors = ("time[class='']", "time[class='edited-timestamp']")
    return tuple(ele.scrape_one(css, "datetime") for css in selectors)


def data_posts(
post,
title,
Expand All @@ -95,6 +101,7 @@ def data_posts(
scraped_number_comments,
number_comments,
published_date,
edited_date,
link,
error,
):
Expand All @@ -109,6 +116,7 @@ def data_posts(
scraped_number_comments=scraped_number_comments,
number_comments=number_comments,
published_date=published_date,
edited_date=edited_date,
external_link=link,
error=error,
)
Expand All @@ -124,6 +132,7 @@ def data_user_posts(
scraped_number_comments,
number_comments,
published_date,
edited_date,
link,
error,
):
Expand All @@ -136,6 +145,7 @@ def data_user_posts(
scraped_number_comments=scraped_number_comments,
number_comments=number_comments,
published_date=published_date,
edited_date=edited_date,
external_link=link,
subreddit=sub,
error=error,
Expand Down Expand Up @@ -198,7 +208,7 @@ def get_comments(self, url: str, all):
try_author = com.scrape_one("a[class^='author']")
author = try_author if try_author else "Deleted"
points = get_points(com)
published_date = com.scrape_one("time", "datetime")
published_date, edited_date = get_dates(com)
if "morerecursion" in com.get("class") and all:
url_rec = f"https://old.reddit.com{com.scrape_one('a', 'href')}"
m_comments = self.get_childs_l500(url_rec, m_comments, parent)
Expand Down Expand Up @@ -243,6 +253,7 @@ def get_comments(self, url: str, all):
parent=parent,
points=points,
published_date=published_date,
edited_date=edited_date,
comment=com.scrape_one("div[class='md']:not(div.child a)"),
error=error,
)
Expand Down Expand Up @@ -276,11 +287,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25):
else:
n_comments = 0
upvote = get_points(post)
# upvote = post.select_one("div[class='score unvoted']").get_text()
# real_points = "" if upvote == "•" else upvote
# if real_points[-1] == "k":
# real_points = int(float(real_points[:-1]) * 1000)
published_date = post.scrape_one("time", "datetime")
published_date, edited_date = get_dates(post)
link = resolve_relative_url(
post.scrape_one("a[class*='title']", "href")
)
Expand All @@ -301,6 +308,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25):
n_comments_scraped,
n_comments,
published_date,
edited_date,
link,
text_error,
)
Expand All @@ -314,6 +322,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25):
n_comments_scraped,
n_comments,
published_date,
edited_date,
link,
text_error,
)
Expand All @@ -336,6 +345,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25):
n_comments_scraped,
n_comments,
published_date,
edited_date,
link,
error,
)
Expand All @@ -349,6 +359,7 @@ def get_general_post(self, url: str, type: str, add_text: bool, nb=25):
n_comments_scraped,
n_comments,
published_date,
edited_date,
link,
error,
)
Expand All @@ -372,6 +383,7 @@ def get_user_comments(self, url: str, nb=25):
post_subreddit="",
points="",
published_date="",
edited_date="",
text="",
comment_url="",
error=error,
Expand All @@ -389,7 +401,7 @@ def get_user_comments(self, url: str, nb=25):
)
post_subreddit = comment.scrape_one("a[class^='subreddit']", "href")
points = get_points(comment)
published_date = comment.scrape_one("time", "datetime")
published_date, edited_date = get_dates(comment)
text = comment.scrape_one("div[class='content'] div[class='md']")
comment_url = comment.scrape_one("a[class='bylink']", "href")
data = RedditUserComment(
Expand All @@ -398,6 +410,7 @@ def get_user_comments(self, url: str, nb=25):
post_subreddit=post_subreddit,
points=points,
published_date=published_date,
edited_date=edited_date,
text=text,
comment_url=comment_url,
error=error,
Expand Down
4 changes: 4 additions & 0 deletions minet/reddit/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class RedditPost(TabularRecord):
scraped_number_comments: str
number_comments: int
published_date: str
edited_date: str
external_link: Optional[str]
error: str

Expand All @@ -26,6 +27,7 @@ class RedditComment(TabularRecord):
parent: str
points: str
published_date: str
edited_date: str
comment: str
error: str

Expand All @@ -39,6 +41,7 @@ class RedditUserPost(TabularRecord):
scraped_number_comments: str
number_comments: int
published_date: str
edited_date: str
external_link: str
subreddit: str
error: str
Expand All @@ -51,6 +54,7 @@ class RedditUserComment(TabularRecord):
post_subreddit: str
points: int
published_date: str
edited_date: str
text: str
comment_url: str
error: str

0 comments on commit 240f1f2

Please sign in to comment.