Made it possible to use --proxy-type when using check mode.
MPCodeWriter21 committed Jul 31, 2022
1 parent ea09958 commit 8bd53c4
Showing 6 changed files with 47 additions and 27 deletions.
17 changes: 14 additions & 3 deletions ProxyEater/Proxy.py
@@ -245,14 +245,25 @@ def check_all(self, timeout: int = 10, threads_no: int = 21, url: str = 'http://
         else:
             on_progress_callback = lambda proxy_list, progress: None

+        length = len(self)
+        finished: int = 0  # The number of proxies that have been checked.
+
         def check_proxy(proxy_: Proxy):
             """
             This function is used for checking the status of a proxy.
             :param proxy_: The proxy to check.
             :return:
             """
+            nonlocal finished
             proxy_.check_status(timeout, url)
             if (not proxy_.is_alive) and remove_dead:
                 self.remove(proxy_)
+            finished += 1
+            on_progress_callback(self, finished / length * 99.99)

         threads = []
-        length = len(self)
-        for i, proxy in enumerate(self.copy()):
+        for proxy in self.copy():
             thread = threading.Thread(target=check_proxy, args=(proxy,))
             threads.append(thread)
             thread.start()
@@ -262,8 +273,8 @@ def check_proxy(proxy_: Proxy):
                     threads.remove(thread)
                     break
                 time.sleep(0.1)
-            on_progress_callback(self, i / length * 100)

         # Wait for all threads to finish
         for thread in threads:
             thread.join()

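The change to `check_all` above moves progress reporting out of the spawning loop, which counted started threads via `i / length`, and into each worker, which increments a shared `finished` counter and reports the fraction of checks that have actually completed. Below is a minimal, self-contained sketch of that pattern, not ProxyEater's actual API: `check_one` and the explicit lock are illustrative stand-ins (the real code calls `Proxy.check_status` and bumps a plain `nonlocal` counter), and the per-thread cap from the real code is omitted.

```python
import threading
import time


def check_all(items, on_progress):
    """Check every item in its own thread and report *completed* work."""
    length = len(items)
    finished = 0
    lock = threading.Lock()  # illustrative; the diff relies on a plain nonlocal counter

    def check_one(item):
        nonlocal finished
        time.sleep(0.01)  # stand-in for the real per-proxy check
        with lock:
            finished += 1
            # Progress reflects checks that have finished, not threads that were started.
            on_progress(finished / length * 99.99)

    # The real code also caps the number of live threads via threads_no; omitted here.
    threads = [threading.Thread(target=check_one, args=(item,)) for item in items]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    on_progress(100.0)


if __name__ == '__main__':
    check_all(list(range(10)), lambda progress: print(f'{progress:.2f}%'))
```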
2 changes: 1 addition & 1 deletion ProxyEater/Scraper.py
@@ -3,8 +3,8 @@

 from typing import Callable as _Callable

-import requests  # This module is used to send requests to the server.
 import pandas  # This module is used to parse the html table.
+import requests  # This module is used to send requests to the server.

 from random_user_agent.user_agent import UserAgent  # This module is used to generate random user agents.

2 changes: 1 addition & 1 deletion ProxyEater/__init__.py
@@ -1,7 +1,7 @@
 # ProxyEater
 # CodeWriter21

-__version__ = "1.5.0"
+__version__ = "1.5.1"
 __author__ = "CodeWriter21"
 __email__ = "[email protected]"
 __license__ = "Apache-2.0"
39 changes: 24 additions & 15 deletions ProxyEater/__main__.py
@@ -40,19 +40,6 @@ def scrape(args):
     else:
         proxy = None

-    proxy_types = []
-    # Parse the proxy type
-    if args.proxy_type:
-        proxy_types = [x.strip() for x in args.proxy_type.split(',')]
-    if not proxy_types:
-        proxy_types = ['http', 'https', 'socks4', 'socks5']
-    try:
-        proxy_types = [ProxyType.from_name(x) for x in proxy_types]
-    except ValueError as e:
-        logger.error(e)
-        return
-    logger.info(f'Using proxy types: {[proxy_type.name for proxy_type in proxy_types]}')
-
     useragent = args.useragent

     proxies = ProxyList()
@@ -86,7 +73,7 @@ def checking_callback(proxy_list: ProxyList, progress: float):
         collected_proxies_count = proxies_.count
         # Filter the proxies
         logger.info('Filtering the proxies...')
-        proxies_ = proxies_.filter(type_=proxy_types)
+        proxies_ = proxies_.filter(type_=args.proxy_types)
         if args.verbose:
             logger.info(f'{scraper.name}: Removed {collected_proxies_count - proxies_.count} proxies of wrong type.')
         collected_proxies_count = proxies_.count
@@ -154,6 +141,14 @@ def check(args):
         logger.error(f'The source format {args.source_format} is not valid.')
         return

+    if len(args.proxy_types) < 4:
+        loaded_proxies_count = proxies.count
+        # Filter the proxies
+        logger.info('Filtering the proxies...')
+        proxies = proxies.filter(type_=args.proxy_types)
+        if args.verbose:
+            logger.info(f'Removed {loaded_proxies_count - proxies.count} proxies of wrong type.')
+
     logger.progress_bar = log21.ProgressBar(format_='Proxies: {count} {prefix}{bar}{suffix} {percentage}%', style='{',
                                             additional_variables={'count': 0})

@@ -209,6 +204,7 @@ def main():
     parser.add_argument('--format', '-f', help='The format for saving the proxies in text file(default:'
                                                 '"{scheme}://{ip}:{port}").',
                         default='{scheme}://{ip}:{port}')
+    parser.add_argument('--proxy-type', '-type', help=f'The type of the proxies(default:all).', default='')
     parser.add_argument('--include-status', '-is', help=f'Include the status of the proxies in the output file.',
                         action='store_true')
     parser.add_argument('--threads', '-t', help=f'The number of threads to use for scraping(default:25).', type=int,
@@ -222,7 +218,6 @@
                         version='%(prog)s ' + ProxyEater.__version__)
     scrap_arguments = parser.add_argument_group('Scrape', 'Scrape mode arguments')
     scrap_arguments.add_argument('--proxy', '-p', help=f'The proxy to use for scraping.')
-    scrap_arguments.add_argument('--proxy-type', '-type', help=f'The type of the proxies(default:all).', default='')
     scrap_arguments.add_argument('--useragent', '-ua', help=f'The useragent of the requests(default:random).')
     scrap_arguments.add_argument('--include-geolocation', '-ig',
                                  help=f'Include the geolocation info of the proxies in the output file.',
@@ -273,6 +268,20 @@ def main():
             args.output = pathlib.Path('.') / f'proxies-{i}.{ext}'
             i += 1

+    proxy_types = []
+    # Parse the proxy type
+    if args.proxy_type:
+        proxy_types = [x.strip() for x in args.proxy_type.split(',')]
+    if not proxy_types:
+        proxy_types = ['http', 'https', 'socks4', 'socks5']
+    try:
+        proxy_types = [ProxyType.from_name(x) for x in proxy_types]
+    except ValueError as e:
+        logger.error(e)
+        return
+    logger.info(f'Using proxy types: {[proxy_type.name for proxy_type in proxy_types]}')
+    args.proxy_types = proxy_types
+
     args.mode = args.mode.lower()
     if args.mode == 'scrape':
         scrape(args)
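The net effect of the `__main__.py` changes is that `--proxy-type` is now a top-level option parsed once in `main()` and stored on `args.proxy_types`, so both `scrape()` and `check()` can filter by it (the `check()` path only filters when fewer than all four types are requested). A standalone sketch of the parse step follows; `ProxyTypeDemo` and `parse_proxy_types` are hypothetical stand-ins for ProxyEater's `ProxyType.from_name` handling, written only to illustrate the comma-separated parsing and the empty-input-means-all default.

```python
import enum


class ProxyTypeDemo(enum.Enum):
    """Illustrative stand-in for ProxyEater's ProxyType enum."""
    HTTP = 'http'
    HTTPS = 'https'
    SOCKS4 = 'socks4'
    SOCKS5 = 'socks5'

    @classmethod
    def from_name(cls, name: str) -> 'ProxyTypeDemo':
        try:
            return cls(name.lower())
        except ValueError:
            raise ValueError(f'{name!r} is not a valid proxy type') from None


def parse_proxy_types(raw: str) -> list:
    """Turn a '--proxy-type http,socks5' style value into enum members."""
    names = [x.strip() for x in raw.split(',')] if raw else []
    if not names:
        # An empty value means "all types", mirroring the parser's default=''.
        names = ['http', 'https', 'socks4', 'socks5']
    return [ProxyTypeDemo.from_name(x) for x in names]


print(parse_proxy_types('http, socks5'))  # HTTP and SOCKS5 members
print(parse_proxy_types(''))              # all four types
```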
12 changes: 6 additions & 6 deletions README.md
@@ -1,4 +1,4 @@
-ProxyEater\[1.5.0\]
+ProxyEater\[1.5.1\]
 ===================

 ![version](https://img.shields.io/pypi/v/ProxyEater)
@@ -35,9 +35,9 @@ Usage

 ```
 usage: ProxyEater [-h] [--source SOURCE] [--output OUTPUT] [--file-format { text, json, csv }]
-                  [--format FORMAT] [--include-status] [--threads THREADS] [--timeout TIMEOUT]
-                  [--url URL] [--verbose] [--quiet] [--version] [--proxy PROXY] [--proxy-type
-                  PROXY_TYPE] [--useragent USERAGENT] [--include-geolocation] [--no-check]
+                  [--format FORMAT] [--proxy-type PROXY_TYPE] [--include-status] [--threads
+                  THREADS] [--timeout TIMEOUT] [--url URL] [--verbose] [--quiet] [--version]
+                  [--proxy PROXY] [--useragent USERAGENT] [--include-geolocation] [--no-check]
                   [--source-format { text, json, csv }] [--default-type { http, https, socks4,
                   socks5 }]
                   mode
@@ -58,6 +58,8 @@ options:
   --format FORMAT, -f FORMAT
                         The format for saving the proxies in text
                         file(default:"{scheme}://{ip}:{port}").
+  --proxy-type PROXY_TYPE, -type PROXY_TYPE
+                        The type of the proxies(default:all).
   --include-status, -is
                         Include the status of the proxies in the output file.
   --threads THREADS, -t THREADS
@@ -78,8 +80,6 @@ Scrape:
   --proxy PROXY, -p PROXY
                         The proxy to use for scraping.
-  --proxy-type PROXY_TYPE, -type PROXY_TYPE
-                        The type of the proxies(default:all).
   --useragent USERAGENT, -ua USERAGENT
                         The useragent of the requests(default:random).
   --include-geolocation, -ig
2 changes: 1 addition & 1 deletion setup.py
@@ -7,7 +7,7 @@

 setup(
     name='ProxyEater',
-    version='1.5.0',
+    version='1.5.1',
     author='CodeWriter21',
     author_email='[email protected]',
     description='A Python Proxy Scraper for gathering fresh proxies.',
