ScrapExplorer - apache.py

Home / tools / utils / DLx / src / drivers / apache.py
[FILE BEGIN]
# SPDX-License-Identifier: GPL-3.0
# DLX
#
# Bulk download tool
#
# COPYRIGHT NOTICE
# Copyright (C) 2025 0x4248 and contributors
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the license is not changed.
#
# This software is free and open source. Licensed under the GNU General
# Public License version 3.0 as published by the Free Software Foundation.

import requests
from bs4 import BeautifulSoup
import urllib.parse


def fetch_files_recursive(base_url, current_path="", visited=None):
    """Walk an Apache-style directory index and return (full_url, relative_path) pairs."""
    if visited is None:
        visited = set()

    url = urllib.parse.urljoin(base_url, current_path)
    normalized_path = url.rstrip("/")

    # Skip directories that were already indexed to avoid looping on self-references.
    if normalized_path in visited:
        return []

    visited.add(normalized_path)
    print(f"Fetching index of {url}")

    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed to retrieve {url}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    files = []

    for item in soup.find_all("a"):
        href = item.get("href")

        # Ignore sort/query links, parent-directory links and non-HTTP schemes.
        if (
            href is None
            or href.startswith(("#", "?", "../", "mailto:", "javascript:", "ftp://"))
            or href.endswith("/.")
        ):
            continue

        full_href = urllib.parse.urljoin(url, href)

        # Path of the entry relative to the base URL, used as the output path.
        parsed_href = urllib.parse.urlparse(full_href)
        relative_path = parsed_href.path[len(urllib.parse.urlparse(base_url).path):].lstrip("/")

        if href.endswith("/"):
            # Directory entry: recurse into it, sharing the visited set.
            files.extend(fetch_files_recursive(base_url, relative_path, visited))
        else:
            files.append((full_href, relative_path))

    return files


class Driver:
    @staticmethod
    def fetch(base_url, output):
        # Write one "full_url<TAB>relative_path" line per discovered file.
        result = fetch_files_recursive(base_url)
        with open(output, "w") as out:
            for full_url, relative_path in result:
                out.write(f"{full_url}\t{relative_path}\n")
[FILE END]
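For context, here is a minimal usage sketch of how this driver might be invoked. It is an assumption for illustration only: the `http://mirror.example.org/pub/` URL and the `downloads.tsv` filename are placeholders, and the import of the module as `apache` assumes it is on the Python path; the actual DLX entry point is not shown in this file.

# Hypothetical usage sketch -- URL, output filename and import path below
# are assumptions for illustration, not DLX's actual invocation.
from apache import Driver

if __name__ == "__main__":
    # Crawl an Apache-style index and write one "url<TAB>relative_path"
    # line per file into a list a downloader can consume later.
    Driver.fetch("http://mirror.example.org/pub/", "downloads.tsv")

    # Read the list back to show the format the driver produces.
    with open("downloads.tsv") as fh:
        for line in fh:
            full_url, relative_path = line.rstrip("\n").split("\t", 1)
            print(f"{relative_path} <- {full_url}")

The tab-separated output pairs each remote URL with the path it should be saved under locally, which is why `relative_path` strips the base URL's path prefix before being written.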