-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathattach_url_anchor_to_markdown_head_list.py
46 lines (37 loc) · 1.86 KB
/
attach_url_anchor_to_markdown_head_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import re
# Markdown bullet list of section headings to turn into a linked table of
# contents. Any leading indentation / bullet marker on a line is preserved.
heads = """
- What's Important When Scraping At Scale?
- Challenge #1 - Sloppy and Always Changing Website Formats
- No Easy Solution
- Challenge 2: Scalable Architecture
- Separate Product Discovery From Product Extraction
- Allocate More Resources To Product Extraction
- Challenge 3: Maintaining Throughput Performance
- Crawling Efficiency
- Challenge 4: Anti-Bot Countermeasures
- Proxies
- Beyond Proxies
- Challenge 5: Data Quality
- Wrapping Things Up
"""

# Optional indentation followed by an optional "-", "*" or "+" bullet marker.
# Every part is optional, so this pattern matches (possibly emptily) any line.
_BULLET = re.compile(r'\s*(?:[-*+]\s+)?')


def anchor_fragment(title: str) -> str:
    """Return the URL fragment (without the leading ``#``) for a heading.

    The title is lower-cased, spaces become hyphens, and every character
    that is neither a word character nor a hyphen is dropped, e.g.
    ``What's Important When Scraping At Scale?`` ->
    ``whats-important-when-scraping-at-scale``.
    """
    return re.sub(r'[^\w-]', '', title.lower().replace(' ', '-'))


def link_head(line: str) -> str:
    """Wrap the heading text of one list line in a Markdown anchor link.

    ``line`` is a single line such as ``"  - Some Title"``. The indentation
    and bullet marker are kept in front of the link; surrounding whitespace
    of the title is stripped so it leaks into neither the link text nor the
    fragment. A line whose title contains no word character has no usable
    anchor and is returned unchanged.
    """
    prefix = _BULLET.match(line).group()
    title = line[len(prefix):].strip()
    fragment = anchor_fragment(title)
    # A fragment made only of hyphens (or nothing) cannot identify a heading.
    if not re.search(r'\w', fragment):
        return line
    return "%s[%s](#%s)" % (prefix, title, fragment)


# Print the linked version of every non-blank line of the list.
for head in heads.splitlines():
    if head.strip():
        print(link_head(head))
# - [What's Important When Scraping At Scale?](#whats-important-when-scraping-at-scale)
# - [Challenge #1 - Sloppy and Always Changing Website Formats](#challenge-1---sloppy-and-always-changing-website-formats)
# - [No Easy Solution](#no-easy-solution)
# - [Challenge 2: Scalable Architecture](#challenge-2-scalable-architecture)
# - [Separate Product Discovery From Product Extraction](#separate-product-discovery-from-product-extraction)
# - [Allocate More Resources To Product Extraction](#allocate-more-resources-to-product-extraction)
# - [Challenge 3: Maintaining Throughput Performance](#challenge-3-maintaining-throughput-performance)
# - [Crawling Efficiency](#crawling-efficiency)
# - [Challenge 4: Anti-Bot Countermeasures](#challenge-4-anti-bot-countermeasures)
# - [Proxies](#proxies)
# - [Beyond Proxies](#beyond-proxies)
# - [Challenge 5: Data Quality](#challenge-5-data-quality)
# - [Wrapping Things Up](#wrapping-things-up)