chore: update version to 0.1.3, adjust Python version requirement, and add GitHub trending example script
ancs21 · Jan 9, 2025 · a60cd8b · a60cd8b
1 parent 58d4ae8
commit a60cd8b
Show file tree

Hide file tree

Showing 5 changed files with 445 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -60,6 +60,211 @@ print(f"Estimated cost: ${cost:.6f}")
 - Built-in token counting using tiktoken
 - Clean and minimal output
 
+## Examples
+
+- [GitHub Trending](examples/github-trending.py)
+
+```
+# Prerequisites
+# ----
+# This script is used to scrape the GitHub Trending page and extract the data into a JSON object.
+# !uv pip install fast-html2md hrequests google-genai python-dotenv
+# create .env file with GOOGLE_API_KEY=your_api_key
+# uv run python examples/github-trending.py
+# ----
+
+# Run the script
+uv run python examples/github-trending.py
+
+# Result (JSON / 10 Jan 2025)
+{
+  "repositories": [
+    {
+      "description": "Your AI second brain. Self-hostable. Get answers from the web or your docs. Build custom agents, schedule automations, do deep research. Turn any online or local LLM into your personal, autonomous AI (gpt, claude, gemini, llama, qwen, mistral). Get started - free.",
+      "forks": 1214,
+      "href": "khoj-ai /  khoj",
+      "name": "khoj",
+      "owner": "khoj-ai",
+      "rank": 1,
+      "repository_name": "khoj-ai /  khoj",
+      "stars": 23173,
+     "built_by": [],
+      "language": "Python",
+      "todays_stars": 1508
+    },
+    {
+      "description": "Resume Matcher is an open source, free tool to improve your resume. It works by using AI, Reader LLMs, to compare and rank resumes with job descriptions.",
+      "forks": 2528,
+      "href": "srbhr /  Resume-Matcher",
+      "name": "Resume-Matcher",
+      "owner": "srbhr",
+      "rank": 2,
+      "repository_name": "srbhr /  Resume-Matcher",
+      "stars": 6814,
+    "built_by": [],
+      "language": "Python",
+      "todays_stars": 436
+    },
+    {
+       "description": "How to run an Ink Node",
+      "forks": 232,
+      "href": "inkonchain /  node",
+      "name": "node",
+      "owner": "inkonchain",
+      "rank": 3,
+      "repository_name": "inkonchain /  node",
+      "stars": 17265,
+     "built_by": [],
+     "language": "Shell",
+      "todays_stars": 3303
+    },
+    {
+     "description": "VILA is a family of state-of-the-art vision language models (VLMs) for diverse multimodal AI tasks across the edge, data center, and cloud.",
+      "forks": 198,
+      "href": "NVlabs /  VILA",
+       "name": "VILA",
+      "owner": "NVlabs",
+      "rank": 4,
+      "repository_name": "NVlabs /  VILA",
+      "stars": 2498,
+    "built_by": [],
+      "language": "Python",
+      "todays_stars": 60
+    },
+    {
+      "description": "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper",
+      "forks": 1691,
+      "href": "unclecode /  crawl4ai",
+      "name": "crawl4ai",
+      "owner": "unclecode",
+      "rank": 5,
+      "repository_name": "unclecode /  crawl4ai",
+      "stars": 23533,
+    "built_by": [],
+      "language": "Python",
+      "todays_stars": 1106
+    },
+    {
+      "description": "Ink Documentation",
+      "forks": 172,
+      "href": "inkonchain /  docs",
+       "name": "docs",
+      "owner": "inkonchain",
+      "rank": 6,
+      "repository_name": "inkonchain /  docs",
+      "stars": 17200,
+     "built_by": [],
+     "language": "MDX",
+      "todays_stars": 3301
+    },
+    {
+     "description": "GoogleTest - Google Testing and Mocking Framework",
+      "forks": 10223,
+      "href": "google /  googletest",
+       "name": "googletest",
+      "owner": "google",
+      "rank": 7,
+      "repository_name": "google /  googletest",
+      "stars": 35180,
+    "built_by": [],
+      "language": "C++",
+      "todays_stars": 10
+    },
+    {
+      "description": "Build your own AI friend",
+       "forks": 338,
+      "href": "78 /  xiaozhi-esp32",
+      "name": "xiaozhi-esp32",
+      "owner": "78",
+      "rank": 8,
+      "repository_name": "78 /  xiaozhi-esp32",
+      "stars": 1973,
+     "built_by": [],
+     "language": "C",
+      "todays_stars": 437
+    },
+    {
+     "description": "Firebase SDK for Apple App Development",
+      "forks": 1504,
+      "href": "firebase /  firebase-ios-sdk",
+      "name": "firebase-ios-sdk",
+      "owner": "firebase",
+      "rank": 9,
+      "repository_name": "firebase /  firebase-ios-sdk",
+      "stars": 5820,
+    "built_by": [],
+      "language": "C++",
+      "todays_stars": 38
+    },
+    {
+     "description": "PyTorch native post-training library",
+      "forks": 476,
+      "href": "pytorch /  torchtune",
+      "name": "torchtune",
+       "owner": "pytorch",
+       "rank": 10,
+      "repository_name": "pytorch /  torchtune",
+      "stars": 4601,
+     "built_by": [],
+      "language": "Python",
+      "todays_stars": 8
+    },
+    {
+       "description": "Fast C++ logging library.",
+      "forks": 4636,
+      "href": "gabime /  spdlog",
+      "name": "spdlog",
+      "owner": "gabime",
+      "rank": 11,
+      "repository_name": "gabime /  spdlog",
+      "stars": 24964,
+   "built_by": [],
+      "language": "C++",
+      "todays_stars": 15
+    },
+    {
+      "description": "Apache Thrift",
+      "forks": 4037,
+      "href": "apache /  thrift",
+       "name": "thrift",
+      "owner": "apache",
+      "rank": 12,
+      "repository_name": "apache /  thrift",
+      "stars": 10508,
+    "built_by": [],
+      "language": "C++",
+      "todays_stars": 12
+    },
+    {
+      "description": "Autonomous coding agent right in your IDE, capable of creating/editing files, executing commands, using the browser, and more with your permission every step of the way.",
+      "forks": 1762,
+      "href": "cline /  cline",
+      "name": "cline",
+       "owner": "cline",
+      "rank": 13,
+      "repository_name": "cline /  cline",
+      "stars": 20784,
+    "built_by": [],
+      "language": "TypeScript",
+      "todays_stars": 345
+    },
+    {
+      "description": "A BNB Smart Chain client based on the go-ethereum fork",
+      "forks": 1587,
+      "href": "bnb-chain /  bsc",
+      "name": "bsc",
+       "owner": "bnb-chain",
+      "rank": 14,
+      "repository_name": "bnb-chain /  bsc",
+      "stars": 2833,
+   "built_by": [],
+      "language": "Go",
+      "todays_stars": 13
+    }
+  ]
+}
+```
+
 ## License
 
 This project is licensed under the MIT License - see the [LICENSE](https://github.com/ancs21/fast-html2md/blob/main/LICENSE) file for details.

diff --git a/examples/github-trending.py b/examples/github-trending.py
@@ -0,0 +1,88 @@
+# ----
+# This script is used to scrape the GitHub Trending page and extract the data into a JSON object.
+# !uv pip install fast-html2md hrequests google-genai python-dotenv
+# create .env file with GOOGLE_API_KEY=your_api_key
+# uv run python examples/github-trending.py
+# ----
+
+import os
+import hrequests
+from fast_html2md import HTMLToMarkdown
+from google import genai
+from google.genai import types
+from dotenv import load_dotenv
+
+# Load settings from a local .env file (see the header: GOOGLE_API_KEY=...).
+load_dotenv()
+
+# Module-level Gemini client shared by main(); os.getenv yields None when
+# GOOGLE_API_KEY is not set in the environment.
+client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
+
+
+def main():
+    url = "https://github.com/trending"
+    response = hrequests.get(url)
+    html = response.text
+    converter = HTMLToMarkdown()
+    markdown = converter.convert(html)
+    # print(markdown)
+    response = client.models.generate_content(
+        model="gemini-2.0-flash-exp",
+        contents=f"""
+    Extract the following Markdown into a JSON object:
+    {markdown}
+    """,
+        config=types.GenerateContentConfig(
+            response_mime_type="application/json",
+            response_schema={
+                "type": "object",
+                "properties": {
+                    "repositories": {
+                        "type": "array",
+                        "items": {
+                            "type": "object",
+                            "properties": {
+                                "rank": {"type": "integer"},
+                                "name": {"type": "string"},
+                                "owner": {"type": "string"},
+                                "repository_name": {"type": "string"},
+                                "description": {"type": "string"},
+                                "language": {"type": "string"},
+                                "stars": {"type": "integer"},
+                                "forks": {"type": "integer"},
+                                "todays_stars": {"type": "number"},
+                                "built_by": {
+                                    "type": "array",
+                                    "items": {
+                                        "type": "object",
+                                        "properties": {
+                                            "username": {"type": "string"},
+                                            "avatar_url": {"type": "string"},
+                                            "href": {"type": "string"},
+                                        },
+                                        "required": ["username", "href"],
+                                    },
+                                },
+                                "href": {"type": "string"},
+                            },
+                            "required": [
+                                "rank",
+                                "name",
+                                "owner",
+                                "repository_name",
+                                "description",
+                                "stars",
+                                "forks",
+                                "href",
+                            ],
+                        },
+                    },
+                },
+                "required": ["repositories"],
+                "description": "Schema for the JSON output of scraping GitHub Trending page.",
+            },
+        ),
+    )
+    print(response.text)
+
+
+# Run the scrape only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "fast-html2md"
-version = "0.1.2"
+version = "0.1.3"
 description = "Convert HTML to Markdown for LLM input extraction"
 readme = "README.md"
 requires-python = ">=3.10"

diff --git a/src/fast_html2md/cleaners.py b/src/fast_html2md/cleaners.py
@@ -4,7 +4,7 @@
 from markdownify import markdownify as md
 from functools import lru_cache
 
-from src.fast_html2md.models import ModelInfo
+from .models import ModelInfo
 
 
 class HTMLCleanerPipeline: