# Proxied Web Scraping Python Code

```table-of-contents
title: Contents
style: nestedList # TOC style (nestedList|inlineFirstLevel)
minLevel: 1 # Include headings from the specified level
maxLevel: 4 # Include headings up to the specified level
includeLinks: true # Make headings clickable
debugInConsole: false # Print debug info in Obsidian console
```

## Overview

Sources:

- [LuminaProxy/lumina.py at main · AnonCatalyst/LuminaProxy](https://github.com/AnonCatalyst/LuminaProxy/blob/main/lumina.py)

## Code

```python
import os

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
from tabulate import tabulate


class ProxyScraper:
    def __init__(self, proxy_websites, target_ports):
        self.proxy_websites = proxy_websites
        self.target_ports = set(target_ports)
        self.valid_proxies = []
        self.invalid_proxies = []
        self.successful_sites = 0  # Track the number of sites successfully scraped
        self.failed_sites = 0      # Track the number of sites that failed to scrape
        self.errors = 0

    def scrape_proxies(self, url):
        try:
            print(f"🌐 Scraping proxies from: {url}")
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            scraped_proxies_count = 0  # Proxies from this page that matched a target port
            for row in soup.find_all('tr')[1:]:  # Skip the header row
                cells = row.find_all(['td', 'th'])
                if len(cells) >= 2:
                    try:
                        ip, port = map(lambda cell: cell.text.strip(), cells[:2])
                        if self.is_valid_ip(ip) and self.is_valid_port(port):
                            proxy_info = f"{ip}:{port}"
                            if int(port) in self.target_ports:
                                self.valid_proxies.append(proxy_info)
                                scraped_proxies_count += 1
                            else:
                                # Well-formed proxy, but not on a target port
                                self.invalid_proxies.append(proxy_info)
                        else:
                            self.invalid_proxies.append(f"❌ Invalid Proxy: {ip}:{port}")
                    except ValueError as ve:
                        print(f"❌ Error parsing proxy info: {ve}")
            if scraped_proxies_count > 0:
                print(f"✅ Successfully scraped \033[1m{scraped_proxies_count}\033[0m proxies from {url}")
                self.successful_sites += 1
            else:
                print(f"ℹ️ No proxies scraped from {url}")
                self.failed_sites += 1
        except requests.exceptions.RequestException as re:
            print(f"❌ Error scraping proxies from {url}: {re}")
            self.errors += 1
            self.failed_sites += 1

    def scrape_proxies_threaded(self):
        # Workers mutate shared lists and counters; CPython's GIL makes
        # list.append safe here, though the integer counters could in
        # principle race under heavy contention.
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(self.scrape_proxies, website) for website in self.proxy_websites]
            for future in tqdm(futures, total=len(self.proxy_websites), desc="🌐 Scraping Proxies"):
                future.result()

    def is_valid_ip(self, ip):
        try:
            parts = ip.split(".")
            return len(parts) == 4 and all(0 <= int(part) < 256 for part in parts)
        except ValueError:
            return False

    def is_valid_port(self, port):
        try:
            port = int(port)
            return 1 <= port <= 65535
        except ValueError:
            return False


def clear_screen():
    os.system('cls' if os.name == 'nt' else 'clear')


def print_welcome():
    clear_screen()
    print("\033[1m\033[92mWelcome to \033[1m\033[95mLuminaProxy\033[0m - Your \033[1m\033[96mBright Proxy Scraper\033[0m\n")
    print("\033[1m🌟 Description:\033[0m LuminaProxy is a powerful tool crafted by AnonCatalyst to streamline the process of gathering proxy information from various online sources.")
    print("Whether you're conducting research, enhancing security, or testing network configurations, LuminaProxy empowers you to effortlessly collect and categorize proxies.")
    print("Its threaded and efficient design ensures a swift scraping experience, providing you with a clear summary of valid proxies, invalid entries, and any encountered errors.")
    print("\033[1m👨‍💻 Developer:\033[0m AnonCatalyst\n\033[1m🔗 GitHub:\033[0m [AnonCatalyst on GitHub](https://github.com/AnonCatalyst)")
    print("\033[1m📸 Instagram:\033[0m [@istoleyourbutter](https://www.instagram.com/istoleyourbutter/)")


def main():
    proxy_websites = [
        'https://www.sslproxies.org/',
        'https://free-proxy-list.net/',
        'https://www.us-proxy.org/',
        'https://www.proxy-list.download/HTTP',
        'https://www.proxy-list.download/HTTPS',
        'https://spys.one/en/socks-proxy-list/',
        'https://www.socks-proxy.net/',
        'https://hidemy.name/en/proxy-list/',
        'https://www.proxy-list.org/en/',
        'https://www.proxyserverlist24.top/',
        'https://www.proxy-list.net/proxy-server-list/',
        'https://www.proxy-daily.com/',
        'https://www.proxynova.com/proxy-server-list/',
        'https://www.proxy-list.biz/',
        'https://www.proxy-list.net/anonymous-proxy-lists.shtml',
        'https://www.proxy-list.net/socks5-proxy-lists.shtml',
        'https://www.my-proxy.com/free-proxy-list.html',
        'https://www.proxy-list.site/',
        'https://www.webshare.io/',
        'https://www.proxyscrape.com/free-proxy-list',
        'https://free-proxy-list.net/uk-proxy.html',
        'https://www.proxynova.com/proxy-server-list/country-us/',
        'https://www.sslproxies.org/socks-proxy-list/',
        'https://free-proxy-list.net/anonymous-proxy.html',
        'https://www.proxynova.com/proxy-server-list/country-br/',
        'https://www.proxynova.com/proxy-server-list/country-cn/',
        'https://www.sslproxies.org/high-anonymous-proxy/',
        'https://www.proxynova.com/proxy-server-list/country-ru/',
        'https://www.proxygather.com/',
        'https://www.proxy-listen.de/azenv.php',
        'https://www.proxyscrape.com/free-proxy-list',
        'https://www.freeproxylists.net/',
        'https://proxy-list.org/english/index.php',
        'https://www.proxy-list.org/',
        'https://www.proxyscrape.com/',
        'https://www.xroxy.com/proxylist.htm',
        'https://www.proxy-list.net/',
        'https://www.proxy4free.com/',
        'https://www.proxybazaar.com/',
        'https://www.proxz.com/',
        'https://www.proxyrack.com/',
        'https://www.proxy-list.download/',
        'https://proxylist.me/',
        'https://proxylist.hidemyass.com/',
        'https://www.proxyscrape.com/api-proxylist/',
        'https://www.proxy-listen.de/azenv.php',
        'https://www.us-proxy.org/',
        'https://www.sslproxies.org/',
        'https://free-proxy-list.net/',
        'https://www.proxynova.com/proxy-server-list/country-fr/',
        'https://www.proxynova.com/proxy-server-list/country-de/',
        # Add more proxy websites here
    ]
    target_ports = {1080, 8000, 8001, 8002, 1082, 80, 8080, 8445, 8443, 8888, 8444, 3128, 1081}
    proxy_scraper = ProxyScraper(proxy_websites, target_ports)
    print_welcome()
    proxy_scraper.scrape_proxies_threaded()

    valid_proxy_count = len(proxy_scraper.valid_proxies)
    invalid_proxy_count = len(proxy_scraper.invalid_proxies)
    total_errors = proxy_scraper.errors
    successful_sites = proxy_scraper.successful_sites
    failed_sites = proxy_scraper.failed_sites

    proxy_summary = [
        {"Category": "\033[1m\033[92mValid Proxies\033[0m", "Count": valid_proxy_count},
        {"Category": "\033[1m\033[91mInvalid Proxies\033[0m", "Count": invalid_proxy_count},
        {"Category": "\033[1m\033[96mTotal Proxies\033[0m", "Count": valid_proxy_count + invalid_proxy_count},
        {"Category": "\033[1mSuccessful Sites\033[0m", "Count": successful_sites},
        {"Category": "\033[1mFailed Sites\033[0m", "Count": failed_sites},
        {"Category": "\033[1mErrors\033[0m", "Count": total_errors},
    ]

    print("\n\033[1m📊 Proxy Summary:\033[0m")
    print(tabulate(proxy_summary, headers="keys"))

    if valid_proxy_count > 0:
        print("\n\033[1m✅ Valid Proxies:\033[0m")
        for proxy in proxy_scraper.valid_proxies:
            print(proxy)
    else:
        print("\n\033[1mℹ️ No valid proxies were found.\033[0m")


if __name__ == "__main__":
    main()
```
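The script above collects candidate proxies but never verifies that any of them actually relay traffic, and entries on public proxy lists go stale quickly. Below is a minimal follow-up sketch, not part of LuminaProxy itself: it assumes the `ip:port` strings in `valid_proxies` are plain HTTP proxies, and it uses `https://httpbin.org/ip` as a stand-in test endpoint (both are assumptions; adjust to your setup).

```python
import requests
from concurrent.futures import ThreadPoolExecutor

TEST_URL = "https://httpbin.org/ip"  # Assumed echo endpoint; any stable URL works


def check_proxy(proxy, timeout=5):
    """Return the proxy string if a request routed through it succeeds, else None."""
    # Assumes an HTTP proxy; SOCKS entries would need the requests[socks]
    # extra and a 'socks5://' scheme instead.
    proxies = {"http": f"http://{proxy}", "https": f"http://{proxy}"}
    try:
        response = requests.get(TEST_URL, proxies=proxies, timeout=timeout)
        response.raise_for_status()
        return proxy
    except requests.exceptions.RequestException:
        return None


def filter_working(proxy_list, max_workers=20):
    """Check proxies concurrently and keep only the responsive ones."""
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(executor.map(check_proxy, proxy_list))
    return [p for p in results if p is not None]


# Example usage after scraping:
# working = filter_working(proxy_scraper.valid_proxies)
# print(f"{len(working)} of {len(proxy_scraper.valid_proxies)} proxies responded")
```

Filtering immediately before use tends to matter more than the raw size of the scraped pool, since most entries on free lists are already dead by the time they are scraped.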
## Details

### About

This note is about …

## See Also

- [[Python Map of Content]]
- [[Python Code]]
- [[Tool - Python]]
- [[Tool - Python Flask]]

## Appendix

Note created on 2024-04-26 and last modified on 2024-04-26.

### Backlinks

```dataview
LIST FROM [[Python - Proxied Web Scraping]] AND -"CHANGELOG" AND -"04-RESOURCES/Code/Python/Python - Proxied Web Scraping"
```

(c) No Clocks, LLC | 2024