Commit 25e2e60: refacto
jpontoire committed Jan 15, 2025 (1 parent: de29187)
Showing 2 changed files with 30 additions and 64 deletions.
.gitignore (1 change: 0 additions & 1 deletion)
@@ -25,7 +25,6 @@ ftest/*.csv
 *.sqlar
 *-wal
 *-shm
-*.csv

 /crawl
 /downloaded
minet/reddit/scraper.py (93 changes: 30 additions & 63 deletions)
@@ -296,6 +296,7 @@ def get_comments(self, url: str, all):
                 yield data

     def get_general_post(self, url: str, type: str, add_text: bool, limit: int):
+        fn = data_posts if type == "subreddit" else data_user_posts
         n_crawled = 0
         old_url = get_old_url(get_url_from_subreddit(url))
         while old_url and (limit is None or n_crawled < limit):
Expand Down Expand Up @@ -328,71 +329,37 @@ def get_general_post(self, url: str, type: str, add_text: bool, limit: int):
post_url, self.pool_manager
)
if text_error:
if type == "subreddit":
yield data_posts(
post,
title,
post_url,
"",
upvote,
n_comments_scraped,
n_comments,
published_date,
edited_date,
link,
text_error,
)
else:
yield data_user_posts(
post,
title,
post_url,
"",
upvote,
n_comments_scraped,
n_comments,
published_date,
edited_date,
link,
text_error,
)
try_content = text_soup.select_one("div#siteTable div.usertext")
if try_content:
content = try_content.get_text()
else:
content = ""
else:
content = ""
if type == "subreddit":
post = data_posts(
post,
title,
post_url,
content,
upvote,
n_comments_scraped,
n_comments,
published_date,
edited_date,
link,
error,
yield fn(
post,
title,
post_url,
None,
upvote,
n_comments_scraped,
n_comments,
published_date,
edited_date,
link,
text_error,
)
content = text_soup.scrape_one(
"div#siteTable div.usertext-body"
)
else:
post = data_user_posts(
post,
title,
post_url,
content,
upvote,
n_comments_scraped,
n_comments,
published_date,
edited_date,
link,
error,
)

yield post
content = ""
yield fn(
post,
title,
post_url,
content,
upvote,
n_comments_scraped,
n_comments,
published_date,
edited_date,
link,
error,
)
n_crawled += 1
old_url = soup.scrape_one("span.next-button a", "href")

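The substance of the refactor is the single line added in the first hunk: instead of branching on type at every yield site, the row-builder function is chosen once up front (fn = data_posts if type == "subreddit" else data_user_posts) and every later call goes through fn. Below is a minimal, self-contained sketch of that dispatch pattern; the builder signatures are simplified stand-ins for illustration, not minet's real data_posts/data_user_posts.

def data_posts(post_id, title, error):
    # Simplified stand-in for the subreddit row builder.
    return {"kind": "post", "id": post_id, "title": title, "error": error}

def data_user_posts(post_id, title, error):
    # Simplified stand-in for the user-page row builder.
    return {"kind": "user_post", "id": post_id, "title": title, "error": error}

def get_general_post(posts, type):
    # Choose the builder once, instead of repeating the
    # `if type == "subreddit": ... else: ...` branch at each yield.
    fn = data_posts if type == "subreddit" else data_user_posts
    for post_id, title, error in posts:
        yield fn(post_id, title, error)

Because both builders take the same positional arguments, the four near-duplicate call sites in the old code collapse to two yield fn(...) calls, which is where the 63 deletions against 30 additions come from. The try_content/get_text guard also becomes unnecessary: judging from how old_url is used as the loop condition at the bottom of the hunk, scrape_one appears to return the extracted value or None when the selector matches nothing, so no explicit presence check is needed.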