import os
import sys
import logging
from time import sleep
from random import uniform
from typing import Optional

import requests
from lxml import html


class SpiderConfig:
    """Base spider configuration."""

    def __init__(self):
        self.base_url = "https://wallhaven.cc/search"
        self.id = "48868"                # tag id used in the "q=id:" search query
        self.timeout = 15                # request timeout in seconds
        self.retries = 3                 # retries per request, in addition to the first attempt
        self.delay = (1, 3)              # random pause between requests, in seconds
        self.max_page_fallback = 100     # page count used when pagination cannot be parsed
        self.image_dir = "images"
        self.log_dir = "logs"
        self.log_file = "wallhaven.log"
        self.debug_mode = False


class ImprovedSpiderConfig(SpiderConfig):
    """Enhanced configuration."""

    def __init__(self):
        super().__init__()
        self.max_threads = 3   # not used by the spider below
        self.proxy = None      # optional proxy URL, not used by the spider below
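
# Neither max_threads nor proxy is consumed anywhere in this script. A minimal
# sketch of how the proxy field could be wired into the requests session, with
# a hypothetical local proxy address:
#
#     config = ImprovedSpiderConfig()
#     config.proxy = "http://127.0.0.1:8080"  # hypothetical proxy
#     session = requests.Session()
#     if config.proxy:
#         session.proxies = {"http": config.proxy, "https": config.proxy}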


class UnicodeSafeStreamHandler(logging.StreamHandler):
    """Handle console encoding safely."""

    def emit(self, record):
        try:
            msg = self.format(record)
            stream = self.stream
            encoding = stream.encoding if stream.encoding else 'utf-8'
            # Round-trip through the console encoding, replacing anything it
            # cannot represent, so logging never raises UnicodeEncodeError.
            msg = msg.encode(encoding, errors='replace').decode(encoding)
            stream.write(msg + self.terminator)
            self.flush()
        except Exception:
            self.handleError(record)


def setup_logging(config: ImprovedSpiderConfig):
    """Initialize the encoding-safe logging system."""
    log_format = '%(asctime)s - [%(levelname)s] %(message)s'
    date_format = '%Y-%m-%d %H:%M:%S'

    os.makedirs(config.log_dir, exist_ok=True)

    logger = logging.getLogger()
    logger.handlers.clear()

    file_handler = logging.FileHandler(
        filename=os.path.join(config.log_dir, config.log_file),
        encoding='utf-8'
    )
    file_handler.setFormatter(logging.Formatter(log_format, date_format))

    console_handler = UnicodeSafeStreamHandler()
    console_handler.setFormatter(logging.Formatter(log_format, date_format))

    log_level = logging.DEBUG if config.debug_mode else logging.INFO
    logger.setLevel(log_level)
    file_handler.setLevel(log_level)
    console_handler.setLevel(log_level)

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)


class LogMark:
    """ASCII-safe log marker strings."""
    START = "=== START ==="
    END = "=== END ==="
    PROCESS_PAGE = "[PROCESS PAGE]"
    FOUND_ITEMS = "[FOUND ITEMS]"
    DOWNLOAD_START = "[DOWNLOAD]"
    DOWNLOAD_SKIP = "[SKIP]"
    DOWNLOAD_SUCCESS = "[SUCCESS]"
    DOWNLOAD_FAIL = "[FAIL]"
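
# With the format configured in setup_logging, a typical line looks like:
#   2025-01-01 12:00:00 - [INFO] [DOWNLOAD] wallhaven-abc123.jpg
# (timestamp and filename are made-up examples; the bracketed markers come
# from LogMark, so the output stays ASCII-safe on any console.)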


class WallHavenSpider:
    """Crawl a wallhaven.cc search and download the full-size wallpapers."""

    def __init__(self, config: ImprovedSpiderConfig):
        self.config = config
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Referer': 'https://wallhaven.cc/'
        })
        os.makedirs(self.config.image_dir, exist_ok=True)

    def _random_delay(self):
        sleep(uniform(*self.config.delay))

    def _request_with_retry(self, url: str) -> Optional[html.HtmlElement]:
        for attempt in range(self.config.retries + 1):
            try:
                response = self.session.get(url, timeout=self.config.timeout)
                response.raise_for_status()
                tree = html.fromstring(response.content)

                if self.config.debug_mode:
                    logging.debug(f"HTML Preview:\n{response.text[:200]}...")
                    logging.debug(f"Nodes found: {len(tree.xpath('//*'))}")

                if tree is None or len(tree) == 0:
                    logging.warning(f"Empty document: {url}")
                    return None
                return tree

            except requests.exceptions.RequestException as e:
                logging.warning(f"Request failed ({e.__class__.__name__}): {url}")
                if attempt == self.config.retries:
                    logging.error(f"Max retries reached: {url}")
                    return None
                self._random_delay()

    def _get_max_page(self, tree: html.HtmlElement) -> int:
        try:
            pagination = tree.xpath('//nav[contains(@class, "pagination")]')
            if pagination:
                page_buttons = pagination[0].xpath('.//a[contains(@class, "pagination-link")]/text()')
                numeric_pages = [int(p.strip()) for p in page_buttons if p.strip().isdigit()]
                if numeric_pages:
                    logging.debug(f"Numeric pages found: {numeric_pages}")
                    return max(numeric_pages)

                last_button = pagination[0].xpath('.//a[contains(@class, "pagination-link")][last()]')
                if last_button and 'href' in last_button[0].attrib:
                    last_page_url = last_button[0].attrib['href']
                    if 'page=' in last_page_url:
                        page_num = last_page_url.split('page=')[-1]
                        if page_num.isdigit():
                            logging.debug(f"Page from URL: {page_num}")
                            return int(page_num)

            header = tree.xpath('//header[h2[@class="section-header"]]/h2/text()')
            if header:
                parts = header[0].split('of')
                if len(parts) > 1:
                    total = parts[-1].strip()
                    if total.isdigit():
                        logging.debug(f"Total items: {total}")
                        return (int(total) - 1) // 24 + 1

            logging.warning("Using fallback page strategy")
            return self.config.max_page_fallback

        except Exception as e:
            logging.error(f"Page parse error: {str(e)}", exc_info=True)
            return self.config.max_page_fallback
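
    # Worked example of the ceiling division above: the code assumes 24
    # thumbnails per search page, so an illustrative total of 150 items gives
    # (150 - 1) // 24 + 1 = 7 pages, while an exact multiple such as 48 gives
    # (48 - 1) // 24 + 1 = 2 pages.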

    def _get_detail_urls(self, tree: html.HtmlElement) -> list:
        urls = tree.xpath('//section[contains(@class, "thumb-listing-page")]//a[contains(@class, "preview")]/@href')
        logging.debug(f"Detail URLs found: {len(urls)}")
        return urls

    def _get_image_url(self, tree: html.HtmlElement) -> Optional[str]:
        url = tree.xpath('//img[@id="wallpaper"]/@src')
        if url:
            logging.debug(f"Image URL found: {url[0]}")
            return url[0]
        logging.warning("Image URL not found")
        return None

    def _download_image(self, url: str):
        try:
            filename = os.path.basename(url.split('?')[0])
            save_path = os.path.join(self.config.image_dir, filename)

            if os.path.exists(save_path):
                logging.info(f"{LogMark.DOWNLOAD_SKIP} {filename}")
                return

            logging.info(f"{LogMark.DOWNLOAD_START} {filename}")

            with self.session.get(url, stream=True, timeout=self.config.timeout) as response:
                response.raise_for_status()

                file_size = int(response.headers.get('Content-Length', 0))
                progress = 0

                with open(save_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            progress += len(chunk)
                            if file_size > 0:
                                logging.debug(f"Progress: {progress}/{file_size} ({progress/file_size:.1%})")

            logging.info(f"{LogMark.DOWNLOAD_SUCCESS} {filename} ({progress/1024:.1f}KB)")

        except Exception as e:
            logging.error(f"{LogMark.DOWNLOAD_FAIL} {url}: {str(e)}")

    def run(self):
        logging.info(LogMark.START)

        base_url = f"{self.config.base_url}?q=id:{self.config.id}"
        logging.debug(f"Base URL: {base_url}")

        initial_tree = self._request_with_retry(base_url)
        if initial_tree is None:
            logging.error("Initial request failed")
            return

        max_page = self._get_max_page(initial_tree)
        logging.debug(f"Max pages calculated: {max_page}")

        for page in range(1, max_page + 1):
            page_url = f"{base_url}&page={page}" if page > 1 else base_url
            logging.info(f"{LogMark.PROCESS_PAGE} {page}: {page_url}")

            tree = self._request_with_retry(page_url)
            if tree is None:
                logging.warning(f"Skip invalid page: {page}")
                continue

            detail_urls = self._get_detail_urls(tree)
            logging.info(f"{LogMark.FOUND_ITEMS} {len(detail_urls)}")

            for idx, url in enumerate(detail_urls, 1):
                self._random_delay()
                logging.debug(f"Processing {idx}/{len(detail_urls)}: {url}")
                detail_tree = self._request_with_retry(url)
                # Explicit None check: lxml elements define truthiness by child
                # count, so "if detail_tree:" would be misleading here.
                if detail_tree is not None:
                    if image_url := self._get_image_url(detail_tree):
                        self._download_image(image_url)
                else:
                    logging.warning(f"Invalid detail page: {url}")

        logging.info(LogMark.END)


if __name__ == "__main__":
    config = ImprovedSpiderConfig()
    config.id = "48868"
    config.delay = (0.5, 1.2)
    config.debug_mode = False
    config.max_page_fallback = 6

    setup_logging(config)

    spider = WallHavenSpider(config)
    spider.run()
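
# A minimal way to run this script (the file name here is an assumption; the
# dependencies are the two third-party imports at the top):
#
#     pip install requests lxml
#     python wallhaven_spider.py
#
# Per SpiderConfig, images land in ./images and the log in ./logs/wallhaven.log.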