京东商品批量采集系统:整店下载、SKU图提取与主图视频下载
引言
很多做电商的朋友在问:“能下载京东主图视频的软件推荐”“京东整店商品图片批量保存工具”“支持京东主图视频下载的软件有吗?”
京东是国内三大电商平台之一,商品包含主图、SKU规格图(颜色/尺寸)、详情图、主图视频等,手动采集效率极低。店铺商品众多,整店采集需求量大。
本文将完整实现一套京东商品批量采集系统,涵盖整店商品列表获取、主图提取、SKU图识别、主图视频下载、详情图提取、自动分类、断点续传等核心功能。一键存图正是基于这套技术实现的,下载的是原图、原尺寸、原格式,无任何压缩、无水印、无MD5篡改。
一、京东平台技术特点分析
1.1 核心难点
| 难点 | 说明 | 解决方案 |
|---|---|---|
| 整店采集 | 需要遍历所有分页 | 分页解析+队列管理 |
| SKU图 | 颜色/尺寸规格图 | 智能识别SKU容器 |
| 主图视频 | mp4直链/m3u8格式 | 视频嗅探+m3u8下载 |
| 懒加载 | 滚动触发图片加载 | 自动滚动触发 |
| 反爬机制 | 检测非浏览器访问 | 浏览器方案,真实指纹 |
1.2 京东图片URL格式
python
# 京东图片URL示例
# 原图格式
#   https://img13.360buyimg.com/n0/xxx.jpg
#   https://img14.360buyimg.com/popWaterMark/xxx.jpg
# 缩略图格式(需要转换)
#   https://img13.360buyimg.com/n1/xxx.jpg
#   https://img13.360buyimg.com/n2/xxx.jpg
# 原图规则:替换为n0或去除尺寸参数
二、京东整店商品列表获取
python
# jd_shop_parser.py
import re
import time
from typing import List, Dict


class JDShopParser:
    """Collect product-page URLs from a JD shop by walking its paginated listing.

    The parser drives an externally supplied browser engine: every listing
    page is opened with ``browser_engine.CreateBrowser(url)`` and inspected by
    evaluating JavaScript through :meth:`_execute_script`.
    """

    # Evaluated on every listing page. Tries each known listing layout in turn
    # and returns the item.jd.com links found by the first selector that
    # matches anything.
    _EXTRACT_LINKS_SCRIPT = """
    (function() {
        const urls = [];
        const selectors = [
            '.gl-item .p-img a',
            '.J_ItemPic a',
            '.product-item a'
        ];
        for (const selector of selectors) {
            const elements = document.querySelectorAll(selector);
            for (const el of elements) {
                let href = el.href;
                if (href && href.includes('item.jd.com')) {
                    urls.push(href);
                }
            }
            if (urls.length > 0) break;
        }
        return urls;
    })();
    """

    def __init__(self, browser_engine):
        """Store the engine; it must expose ``CreateBrowser(url)``."""
        self.browser = browser_engine

    def get_all_product_urls(
        self,
        shop_url: str,
        max_pages: int = 100,
        page_delay: float = 2.0,
        load_timeout: float = 10,
    ) -> List[str]:
        """Return the de-duplicated product URLs of a shop, in discovery order.

        Args:
            shop_url: Shop base URL; a trailing ``/`` is tolerated.
            max_pages: Upper bound on the number of listing pages to visit.
            page_delay: Seconds to pause after each page that yielded links.
            load_timeout: Seconds to wait for each page to finish loading.

        Returns:
            Product URLs with duplicates removed, first occurrence kept.
            Pagination stops early at the first page that fails to load or
            yields no links.
        """
        # Avoid "//search-..." when the caller passes a URL ending in "/".
        base_url = shop_url.rstrip('/')
        all_urls = []
        for page in range(1, max_pages + 1):
            # JD shop listing pagination URL pattern.
            page_url = f"{base_url}/search-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-{page}.html"
            print(f"解析第{page}页: {page_url}")
            browser = self.browser.CreateBrowser(page_url)
            if not self._wait_for_load(browser, load_timeout):
                break
            new_urls = self._execute_script(browser, self._EXTRACT_LINKS_SCRIPT)
            if not new_urls:
                break
            all_urls.extend(new_urls)
            print(f"第{page}页: {len(new_urls)}个商品")
            time.sleep(page_delay)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # result is stable between runs (a plain set() is not).
        all_urls = list(dict.fromkeys(all_urls))
        print(f"共发现{len(all_urls)}个商品")
        return all_urls

    def _wait_for_load(self, browser, timeout: float) -> bool:
        """Poll ``document.readyState`` until it is complete or *timeout* elapses.

        Returns True as soon as the readiness script evaluates truthy, False
        once *timeout* seconds have passed (immediately when ``timeout <= 0``).
        """
        ready_script = "document.readyState === 'complete'"
        # Monotonic clock: unaffected by wall-clock adjustments during the wait.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if self._execute_script(browser, ready_script):
                return True
            time.sleep(0.5)
        return False

    def _execute_script(self, browser, script: str):
        """Evaluate *script* in *browser* and return its result.

        NOTE: this is a placeholder that returns None. It must be overridden
        (or replaced) with the real browser-engine call; until then
        ``_wait_for_load`` always times out and no URLs are collected.
        """
        return None
# 三、京东商品解析引擎
javascript
// jd_product_extractor.js
(function() {
    'use strict';

    /**
     * JD product page extractor.
     *
     * Runs inside the product page and collects the title, main gallery
     * images, SKU (colour/size) images, detail images and the main video.
     * A single `seenUrls` set is shared by all image categories, so a URL is
     * reported only once, in the first category that finds it.
     */
    class JDProductExtractor {
        constructor() {
            this.result = {
                title: '',
                mainImages: [],
                skuImages: [],      // SKU spec images (colour/size)
                detailImages: [],
                videos: []
            };
            this.seenUrls = new Set();
        }

        /** Wait for the document to finish loading and the gallery to appear. */
        async waitForPageReady() {
            while (document.readyState !== 'complete') {
                await this.sleep(200);
            }
            await this.waitForImageContainer();
            await this.sleep(1000);
        }

        /** Poll up to 30 times (500 ms apart) for the main image element. */
        async waitForImageContainer() {
            let maxWait = 30;
            while (maxWait-- > 0) {
                if (document.querySelector('.spec-img, .J_zoomPic')) {
                    return;
                }
                await this.sleep(500);
            }
        }

        /** Promise-based delay. */
        sleep(ms) {
            return new Promise(resolve => setTimeout(resolve, ms));
        }

        /** Scroll through the page in steps so lazy-loaded images get requested. */
        async triggerLazyLoad() {
            window.scrollTo(0, document.body.scrollHeight);
            await this.sleep(500);
            const step = document.body.scrollHeight / 5;
            for (let i = 1; i <= 5; i++) {
                window.scrollTo(0, i * step);
                await this.sleep(200);
            }
            window.scrollTo(0, 0);
            await this.sleep(300);
        }

        /**
         * Normalise an image URL to its original-size form.
         *
         * @param {string|null} url raw URL from `src` or a lazy-load attribute
         * @returns {string|null} normalised URL, or null for empty values,
         *          data URIs and obvious placeholders
         */
        getOriginalUrl(url) {
            if (!url) return null;
            if (url.startsWith('data:')) return null;
            if (url.includes('1x1') || url.includes('blank')) return null;
            // Attribute values may be protocol-relative ("//img13..."); give
            // them a scheme so the URL is usable outside the page.
            if (url.startsWith('//')) url = 'https:' + url;
            url = url.split('?')[0];
            // Size folder n1/n2/... -> n0 (original). \d+ also covers
            // multi-digit folders, which the single-digit pattern skipped.
            url = url.replace(/\/n\d+\//, '/n0/');
            url = url.replace(/_\d+x\d+\./g, '.');
            return url;
        }

        /**
         * Best usable URL for an <img>: `src` first, then `data-lazy-img`.
         * Falls through to the lazy attribute when `src` is a placeholder
         * that normalises to null.
         */
        pickImageUrl(img) {
            const candidates = [img.src, img.getAttribute('data-lazy-img')];
            for (const candidate of candidates) {
                const original = this.getOriginalUrl(candidate);
                if (original) return original;
            }
            return null;
        }

        /** Record `url` as seen; returns false when it is empty or already seen. */
        claimUrl(url) {
            if (!url || this.seenUrls.has(url)) return false;
            this.seenUrls.add(url);
            return true;
        }

        /** First title-like element whose text is longer than 5 characters. */
        extractTitle() {
            const selectors = ['.sku-name', '.product-title', 'h1'];
            for (const s of selectors) {
                const el = document.querySelector(s);
                if (el && el.textContent) {
                    const title = el.textContent.trim();
                    if (title.length > 5) return title;
                }
            }
            return document.title || '京东商品';
        }

        /** Main image followed by the thumbnail strip, as original-size URLs. */
        extractMainImages() {
            const images = [];

            // Main (zoom) image
            const mainImg = document.querySelector('.spec-img, .J_zoomPic');
            if (mainImg) {
                const original = this.pickImageUrl(mainImg);
                if (this.claimUrl(original)) images.push(original);
            }

            // Thumbnail list
            const thumbs = document.querySelectorAll('.spec-thumb img, .J_thumImg');
            for (const thumb of thumbs) {
                const original = this.pickImageUrl(thumb);
                if (this.claimUrl(original)) images.push(original);
            }
            return images;
        }

        /** SKU images as `{ url, name }`; name falls back to the title attribute, then '规格'. */
        extractSkuImages() {
            const skuImages = [];
            const skuContainer = document.querySelector('.sku-img-list, .J_skuImgList');
            if (!skuContainer) return skuImages;

            const items = skuContainer.querySelectorAll('.sku-img-item, .J_skuImgItem');
            for (const item of items) {
                let name = '';
                const nameEl = item.querySelector('.sku-name, .J_skuName');
                if (nameEl) name = nameEl.textContent?.trim();
                if (!name) name = item.getAttribute('title') || '规格';

                const img = item.querySelector('img');
                if (!img) continue;
                const original = this.pickImageUrl(img);
                if (this.claimUrl(original)) {
                    skuImages.push({ url: original, name: name });
                }
            }
            return skuImages;
        }

        /** Every image inside the first matching detail container. */
        extractDetailImages() {
            const images = [];
            const container = document.querySelector('#detail, .detail-content, .J_detailContent');
            if (!container) return images;

            for (const img of container.querySelectorAll('img')) {
                const original = this.pickImageUrl(img);
                if (this.claimUrl(original)) images.push(original);
            }
            return images;
        }

        /**
         * Classify a video URL by the extension of its path; query string and
         * fragment are ignored. Returns `fallback` when the extension is
         * neither .mp4 nor .m3u8.
         */
        detectVideoType(url, fallback) {
            const path = url.split(/[?#]/)[0].toLowerCase();
            if (path.endsWith('.m3u8')) return 'm3u8';
            if (path.endsWith('.mp4')) return 'mp4';
            return fallback;
        }

        /** Main video as `{ url, type }`, or null when none is found. */
        extractVideo() {
            // <video> element; currentSrc also covers <source> children,
            // where `src` itself is empty.
            const video = document.querySelector('.JDV-video video, .video-box video');
            const src = video && (video.currentSrc || video.src);
            if (src) {
                return { url: src, type: this.detectVideoType(src, 'm3u8') };
            }

            // Page-level configuration data
            if (window.pageConfig && window.pageConfig.product && window.pageConfig.product.videoUrl) {
                const configUrl = window.pageConfig.product.videoUrl;
                return { url: configUrl, type: this.detectVideoType(configUrl, 'mp4') };
            }
            return null;
        }

        /** Run the whole extraction; resolves to the populated result object. */
        async extract() {
            await this.waitForPageReady();
            await this.triggerLazyLoad();

            this.result.title = this.extractTitle();
            this.result.mainImages = this.extractMainImages();
            this.result.skuImages = this.extractSkuImages();
            this.result.detailImages = this.extractDetailImages();

            const video = this.extractVideo();
            if (video) this.result.videos.push(video);

            return this.result;
        }
    }

    return new JDProductExtractor().extract();
})();
// 四、m3u8视频下载器
python
# m3u8_downloader.py
import os
import shutil
import tempfile
import time
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin

import m3u8
import requests


class M3U8Downloader:
    """Download an HLS (m3u8) stream by fetching its segments and joining them.

    Segments are concatenated byte-for-byte in playlist order; nothing is
    decrypted or remuxed.
    """

    def __init__(self):
        # Headers sent with the playlist request and every segment request.
        self.headers = {'User-Agent': 'Mozilla/5.0', 'Referer': 'https://item.jd.com/'}

    def download(self, m3u8_url, output_path):
        """Download every segment of *m3u8_url* and write them to *output_path*.

        Returns:
            True when all segments were downloaded and merged. False when the
            playlist lists no segments or any segment failed after its
            retries; in that case *output_path* is not written.
        """
        playlist = m3u8.load(m3u8_url, headers=self.headers)
        # urljoin resolves relative, root-relative and absolute segment URIs
        # against the playlist URL, including playlist URLs with a query string.
        segments = [urljoin(m3u8_url, seg.uri) for seg in playlist.segments]
        if not segments:
            return False

        # Unique directory per call, so downloads started within the same
        # second cannot share (and delete) each other's segments.
        temp_dir = tempfile.mkdtemp(prefix="temp_")
        try:
            ts_files = [
                os.path.join(temp_dir, f"seg_{i:05d}.ts")
                for i in range(len(segments))
            ]
            with ThreadPoolExecutor(max_workers=10) as executor:
                outcomes = list(executor.map(self._download_ts, segments, ts_files))

            # A missing segment would yield a silently corrupted video, so
            # refuse to merge unless every segment arrived.
            if not all(outcomes):
                return False

            with open(output_path, 'wb') as out:
                for ts in ts_files:
                    with open(ts, 'rb') as f:
                        shutil.copyfileobj(f, out)
            return True
        finally:
            # Clean up on success, failure or exception.
            shutil.rmtree(temp_dir, ignore_errors=True)

    def _download_ts(self, url, path):
        """Fetch one segment to *path*, trying up to three times.

        Returns True once a 200 response has been written to disk, False when
        all attempts failed (non-200 status, request error, or write error).
        """
        attempts = 3
        for attempt in range(attempts):
            try:
                resp = requests.get(url, headers=self.headers, timeout=30)
                if resp.status_code == 200:
                    with open(path, 'wb') as f:
                        f.write(resp.content)
                    return True
            except (requests.RequestException, OSError):
                pass
            if attempt < attempts - 1:
                time.sleep(1)
        return False
# 五、批量采集调度器
python
# jd_batch_collector.py
import os
import json
import time
import threading
import re
from queue import Queue, Empty
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import Optional


@dataclass
class JDProductData:
    """Result record for one collected product."""

    url: str
    sku_id: str
    title: str
    main_images: list
    sku_images: list
    detail_images: list
    videos: list
    success: bool = True
    error: Optional[str] = None
    timestamp: Optional[str] = None

    def __post_init__(self):
        # Stamp the record at creation time unless the caller supplied one.
        if not self.timestamp:
            self.timestamp = datetime.now().isoformat()


class JDBatchCollector:
    """Queue-driven batch collector with checkpoint/resume support.

    SKU ids of successfully collected products are persisted to a JSON state
    file after every success, so a later run skips them in :meth:`add_urls`.
    """

    # Extracts the numeric SKU id from URLs such as ".../100012345678.html".
    _SKU_ID_RE = re.compile(r'/(\d+)\.html')

    def __init__(self, output_dir='./downloads/jd', state_file="jd_batch_state.json", delay=2.0):
        """
        Args:
            output_dir: Directory that receives the final results JSON.
            state_file: Path of the checkpoint file (default: in the CWD).
            delay: Seconds to pause after each processed product.
        """
        self.output_dir = output_dir
        self.queue = Queue()
        self.results = []
        self.lock = threading.Lock()
        self.completed_ids = set()
        self.state_file = state_file
        self.delay = delay
        # SKU ids currently waiting in the queue; prevents duplicate tasks.
        self._queued_ids = set()
        self._load_state()

    def _load_state(self):
        """Load completed SKU ids from the state file, if it exists and is valid."""
        if not os.path.exists(self.state_file):
            return
        try:
            with open(self.state_file, 'r', encoding='utf-8') as f:
                self.completed_ids = set(json.load(f).get('completed_ids', []))
        except (OSError, ValueError, AttributeError, TypeError) as exc:
            # Unreadable or malformed checkpoint: start from scratch, but say so.
            print(f"⚠️ 断点文件无法读取, 将从头开始: {exc}")
            return
        print(f"📁 加载断点: 已完成{len(self.completed_ids)}个商品")

    def _save_state(self):
        """Persist the completed SKU ids.

        Acquires ``self.lock`` (non-reentrant), so it must not be called while
        the lock is already held. The file is written to a temporary path and
        renamed into place, so an interrupted write cannot leave a truncated
        checkpoint behind.
        """
        with self.lock:
            payload = {
                'completed_ids': sorted(self.completed_ids, key=str),
                'last_update': datetime.now().isoformat(),
            }
            tmp_path = f"{self.state_file}.tmp"
            with open(tmp_path, 'w', encoding='utf-8') as f:
                json.dump(payload, f, indent=2)
            os.replace(tmp_path, self.state_file)

    def add_urls(self, urls):
        """Queue product URLs, skipping completed, duplicate and id-less ones."""
        for url in urls:
            match = self._SKU_ID_RE.search(url)
            sid = match.group(1) if match else ''
            if not sid:
                continue
            with self.lock:
                if sid in self.completed_ids or sid in self._queued_ids:
                    continue
                self._queued_ids.add(sid)
            self.queue.put({'url': url, 'sku_id': sid})
        print(f"📋 队列中有{self.queue.qsize()}个待处理商品")

    def collect_all(self, collector_func):
        """Process every queued task and return the list of results.

        Args:
            collector_func: Callable taking a product URL and returning a
                :class:`JDProductData`. An exception it raises is recorded as
                a failed result and does not abort the batch.
        """
        print(f"🚀 开始批量采集")
        threads = [
            threading.Thread(target=self._worker, args=(collector_func,))
            for _ in range(1)
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()
        self._save_results()
        return self.results

    def _worker(self, collector_func):
        """Drain the queue, recording one result per task."""
        while True:
            # get_nowait + Empty avoids the empty()/get() race between workers.
            try:
                task = self.queue.get_nowait()
            except Empty:
                return

            sku_id = task['sku_id']
            print(f"📦 采集商品: {sku_id}")
            try:
                result = collector_func(task['url'])
                result.sku_id = sku_id
            except Exception as exc:
                # Keep a failure record so the product is neither silently
                # lost nor marked complete.
                result = JDProductData(
                    url=task['url'], sku_id=sku_id, title='',
                    main_images=[], sku_images=[], detail_images=[], videos=[],
                    success=False, error=f"{type(exc).__name__}: {exc}",
                )

            with self.lock:
                self._queued_ids.discard(sku_id)
                self.results.append(result)
                if result.success:
                    self.completed_ids.add(sku_id)
                total = len(self.results)
                success_count = sum(1 for r in self.results if r.success)

            # Called after releasing the lock: _save_state takes it itself.
            if result.success:
                self._save_state()
            print(f"📊 进度: {total}个商品, 成功: {success_count}")
            self.queue.task_done()
            time.sleep(self.delay)

    def _save_results(self):
        """Write all results to a timestamped JSON file inside ``output_dir``."""
        os.makedirs(self.output_dir, exist_ok=True)
        file = os.path.join(
            self.output_dir,
            f"jd_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
        )
        with open(file, 'w', encoding='utf-8') as f:
            json.dump([asdict(r) for r in self.results], f, ensure_ascii=False, indent=2)
# 六、保存目录结构
text
downloads/jd/
├── 100012345678_2024新款女装/
│   ├── 主图/
│   │   ├── 主图_1.jpg
│   │   ├── 主图_2.jpg
│   │   └── 主图_3.jpg
│   ├── SKU图/
│   │   ├── 红色.jpg
│   │   ├── 蓝色.jpg
│   │   └── 黑色.jpg
│   ├── 详情图/
│   │   └── 详情图_1.jpg
│   ├── 视频/
│   │   └── 视频.mp4
│   └── 商品信息.json
└── jd_results_20250101_120000.json
七、总结
| 模块 | 功能 |
|---|---|
| 整店解析 | 获取店铺所有商品链接 |
| SKU识别 | 颜色/尺寸规格图自动分类 |
| 视频下载 | mp4/m3u8格式支持 |
| 批量调度 | 队列+断点续传 |
核心要点:基于Chromium浏览器内核,下载的是京东的原图、原尺寸、原格式
结论:如果你需要一款稳定、自动分类、支持全平台的电商图片下载工具,一键存图是目前最省心的选择。
百度搜索“一键存图”或“火蚁一键存图”即可找到。
