ScrapExplorer - apache.py

Home / tools / utils / DLx / src / drivers / apache.py
[FILE BEGIN]
# SPDX-License-Identifier: GPL-3.0
# DLX
#
# Bulk download tool
#
# COPYRIGHT NOTICE
# Copyright (C) 2025 0x4248 and contributors
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the license is not changed.
#
# This software is free and open source. Licensed under the GNU General
# Public License version 3.0 as published by the Free Software Foundation.

import requests
from bs4 import BeautifulSoup
import urllib.parse


def fetch_files_recursive(base_url, current_path="", visited=None):
    """Walk an Apache-style directory index and return (full_url, relative_path) pairs."""
    if visited is None:
        visited = set()

    url = urllib.parse.urljoin(base_url, current_path)
    normalized_path = url.rstrip("/")

    # Skip directories that were already indexed to avoid looping on self-references.
    if normalized_path in visited:
        return []

    visited.add(normalized_path)
    print(f"Fetching index of {url}")

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    files = []

    for item in soup.find_all("a"):
        href = item.get("href")

        # Ignore sort/query links, parent-directory links and non-HTTP schemes.
        if (
            href is None
            or href.startswith(("#", "?", "../", "mailto:", "javascript:", "ftp://"))
            or href.endswith("/.")
        ):
            continue

        full_href = urllib.parse.urljoin(url, href)

        # Path of the entry relative to the base URL, used as the output path.
        parsed_href = urllib.parse.urlparse(full_href)
        relative_path = parsed_href.path[len(urllib.parse.urlparse(base_url).path):].lstrip("/")

        if href.endswith("/"):
            # Directory entry: recurse into it, sharing the visited set.
            files.extend(fetch_files_recursive(base_url, relative_path, visited))
        else:
            files.append((full_href, relative_path))

    return files


class Driver:
    @staticmethod
    def fetch(base_url, output):
        # Write one "full_url<TAB>relative_path" line per discovered file.
        result = fetch_files_recursive(base_url)
        with open(output, "w") as out:
            for full_url, relative_path in result:
                out.write(f"{full_url}\t{relative_path}\n")
[FILE END]
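For context, here is a minimal usage sketch of how this driver might be invoked. It is an assumption for illustration only: the `http://mirror.example.org/pub/` URL and the `downloads.tsv` filename are placeholders, and the import of the module as `apache` assumes it is on the Python path; the actual DLX entry point is not shown in this file.

# Hypothetical usage sketch -- URL, output filename and import path below
# are assumptions for illustration, not DLX's actual invocation.
from apache import Driver

if __name__ == "__main__":
    # Crawl an Apache-style index and write one "url<TAB>relative_path"
    # line per file into a list a downloader can consume later.
    Driver.fetch("http://mirror.example.org/pub/", "downloads.tsv")

    # Read the list back to show the format the driver produces.
    with open("downloads.tsv") as fh:
        for line in fh:
            full_url, relative_path = line.rstrip("\n").split("\t", 1)
            print(f"{relative_path} <- {full_url}")

The tab-separated output pairs each remote URL with the path it should be saved under locally, which is why `relative_path` strips the base URL's path prefix before being written.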