Skip to content

Commit

Permalink
chore: update version to 0.1.3, adjust Python version requirement, an…
Browse files Browse the repository at this point in the history
…d add GitHub trending example script
  • Loading branch information
ancs21 committed Jan 9, 2025
1 parent 58d4ae8 commit a60cd8b
Show file tree
Hide file tree
Showing 5 changed files with 445 additions and 5 deletions.
205 changes: 205 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,211 @@ print(f"Estimated cost: ${cost:.6f}")
- Built-in token counting using tiktoken
- Clean and minimal output

## Examples

- [GitHub Trending](examples/github-trending.py)

```
# Prerequisites
# ----
# This script is used to scrape the GitHub Trending page and extract the data into a JSON object.
# !uv pip install fast-html2md hrequests google-genai python-dotenv
# create .env file with GOOGLE_API_KEY=your_api_key
# uv run python examples/github-trending.py
# ----
# Run the script
uv run python examples/github-trending.py
# Result (JSON / 10 Jan 2025)
{
"repositories": [
{
"description": "Your AI second brain. Self-hostable. Get answers from the web or your docs. Build custom agents, schedule automations, do deep research. Turn any online or local LLM into your personal, autonomous AI (gpt, claude, gemini, llama, qwen, mistral). Get started - free.",
"forks": 1214,
"href": "khoj-ai / khoj",
"name": "khoj",
"owner": "khoj-ai",
"rank": 1,
"repository_name": "khoj-ai / khoj",
"stars": 23173,
"built_by": [],
"language": "Python",
"todays_stars": 1508
},
{
"description": "Resume Matcher is an open source, free tool to improve your resume. It works by using AI, Reader LLMs, to compare and rank resumes with job descriptions.",
"forks": 2528,
"href": "srbhr / Resume-Matcher",
"name": "Resume-Matcher",
"owner": "srbhr",
"rank": 2,
"repository_name": "srbhr / Resume-Matcher",
"stars": 6814,
"built_by": [],
"language": "Python",
"todays_stars": 436
},
{
"description": "How to run an Ink Node",
"forks": 232,
"href": "inkonchain / node",
"name": "node",
"owner": "inkonchain",
"rank": 3,
"repository_name": "inkonchain / node",
"stars": 17265,
"built_by": [],
"language": "Shell",
"todays_stars": 3303
},
{
"description": "VILA is a family of state-of-the-art vision language models (VLMs) for diverse multimodal AI tasks across the edge, data center, and cloud.",
"forks": 198,
"href": "NVlabs / VILA",
"name": "VILA",
"owner": "NVlabs",
"rank": 4,
"repository_name": "NVlabs / VILA",
"stars": 2498,
"built_by": [],
"language": "Python",
"todays_stars": 60
},
{
"description": "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper",
"forks": 1691,
"href": "unclecode / crawl4ai",
"name": "crawl4ai",
"owner": "unclecode",
"rank": 5,
"repository_name": "unclecode / crawl4ai",
"stars": 23533,
"built_by": [],
"language": "Python",
"todays_stars": 1106
},
{
"description": "Ink Documentation",
"forks": 172,
"href": "inkonchain / docs",
"name": "docs",
"owner": "inkonchain",
"rank": 6,
"repository_name": "inkonchain / docs",
"stars": 17200,
"built_by": [],
"language": "MDX",
"todays_stars": 3301
},
{
"description": "GoogleTest - Google Testing and Mocking Framework",
"forks": 10223,
"href": "google / googletest",
"name": "googletest",
"owner": "google",
"rank": 7,
"repository_name": "google / googletest",
"stars": 35180,
"built_by": [],
"language": "C++",
"todays_stars": 10
},
{
"description": "Build your own AI friend",
"forks": 338,
"href": "78 / xiaozhi-esp32",
"name": "xiaozhi-esp32",
"owner": "78",
"rank": 8,
"repository_name": "78 / xiaozhi-esp32",
"stars": 1973,
"built_by": [],
"language": "C",
"todays_stars": 437
},
{
"description": "Firebase SDK for Apple App Development",
"forks": 1504,
"href": "firebase / firebase-ios-sdk",
"name": "firebase-ios-sdk",
"owner": "firebase",
"rank": 9,
"repository_name": "firebase / firebase-ios-sdk",
"stars": 5820,
"built_by": [],
"language": "C++",
"todays_stars": 38
},
{
"description": "PyTorch native post-training library",
"forks": 476,
"href": "pytorch / torchtune",
"name": "torchtune",
"owner": "pytorch",
"rank": 10,
"repository_name": "pytorch / torchtune",
"stars": 4601,
"built_by": [],
"language": "Python",
"todays_stars": 8
},
{
"description": "Fast C++ logging library.",
"forks": 4636,
"href": "gabime / spdlog",
"name": "spdlog",
"owner": "gabime",
"rank": 11,
"repository_name": "gabime / spdlog",
"stars": 24964,
"built_by": [],
"language": "C++",
"todays_stars": 15
},
{
"description": "Apache Thrift",
"forks": 4037,
"href": "apache / thrift",
"name": "thrift",
"owner": "apache",
"rank": 12,
"repository_name": "apache / thrift",
"stars": 10508,
"built_by": [],
"language": "C++",
"todays_stars": 12
},
{
"description": "Autonomous coding agent right in your IDE, capable of creating/editing files, executing commands, using the browser, and more with your permission every step of the way.",
"forks": 1762,
"href": "cline / cline",
"name": "cline",
"owner": "cline",
"rank": 13,
"repository_name": "cline / cline",
"stars": 20784,
"built_by": [],
"language": "TypeScript",
"todays_stars": 345
},
{
"description": "A BNB Smart Chain client based on the go-ethereum fork",
"forks": 1587,
"href": "bnb-chain / bsc",
"name": "bsc",
"owner": "bnb-chain",
"rank": 14,
"repository_name": "bnb-chain / bsc",
"stars": 2833,
"built_by": [],
"language": "Go",
"todays_stars": 13
}
]
}
```

## License

This project is licensed under the MIT License - see the [LICENSE](https://github.com/ancs21/fast-html2md/blob/main/LICENSE) file for details.
Expand Down
88 changes: 88 additions & 0 deletions examples/github-trending.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# ----
# This script is used to scrape the GitHub Trending page and extract the data into a JSON object.
# !uv pip install fast-html2md hrequests google-genai python-dotenv
# create .env file with GOOGLE_API_KEY=your_api_key
# uv run python examples/github-trending.py
# ----

import os
import hrequests
from fast_html2md import HTMLToMarkdown
from google import genai
from google.genai import types
from dotenv import load_dotenv

load_dotenv()

client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))


def main():
url = "https://github.com/trending"
response = hrequests.get(url)
html = response.text
converter = HTMLToMarkdown()
markdown = converter.convert(html)
# print(markdown)
response = client.models.generate_content(
model="gemini-2.0-flash-exp",
contents=f"""
Extract the following Markdown into a JSON object:
{markdown}
""",
config=types.GenerateContentConfig(
response_mime_type="application/json",
response_schema={
"type": "object",
"properties": {
"repositories": {
"type": "array",
"items": {
"type": "object",
"properties": {
"rank": {"type": "integer"},
"name": {"type": "string"},
"owner": {"type": "string"},
"repository_name": {"type": "string"},
"description": {"type": "string"},
"language": {"type": "string"},
"stars": {"type": "integer"},
"forks": {"type": "integer"},
"todays_stars": {"type": "number"},
"built_by": {
"type": "array",
"items": {
"type": "object",
"properties": {
"username": {"type": "string"},
"avatar_url": {"type": "string"},
"href": {"type": "string"},
},
"required": ["username", "href"],
},
},
"href": {"type": "string"},
},
"required": [
"rank",
"name",
"owner",
"repository_name",
"description",
"stars",
"forks",
"href",
],
},
},
},
"required": ["repositories"],
"description": "Schema for the JSON output of scraping GitHub Trending page.",
},
),
)
print(response.text)


if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "hatchling.build"

[project]
name = "fast-html2md"
version = "0.1.2"
version = "0.1.3"
description = "Convert HTML to Markdown for LLM input extraction"
readme = "README.md"
requires-python = ">=3.10"
Expand Down
2 changes: 1 addition & 1 deletion src/fast_html2md/cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from markdownify import markdownify as md
from functools import lru_cache

from src.fast_html2md.models import ModelInfo
from .models import ModelInfo


class HTMLCleanerPipeline:
Expand Down
Loading

0 comments on commit a60cd8b

Please sign in to comment.