当前位置：首页 > news > 正文

Python xhs SDK：突破性小红书数据采集的3个高效方案

news 2026/6/12 22:59:32

Python xhs SDK：突破性小红书数据采集的3个高效方案

【免费下载链接】xhs：基于小红书 Web 端进行的请求封装。 https://reajason.github.io/xhs/ 项目地址：https://gitcode.com/gh_mirrors/xh/xhs

在小红书内容生态日益繁荣的今天，如何高效、稳定地获取平台公开数据成为数据分析师和开发者面临的核心挑战。传统爬虫方案不仅维护成本高，还面临频繁的签名验证和反爬虫机制。xhs Python SDK通过智能封装小红书Web端请求，提供了三种突破性解决方案，让数据采集变得简单高效。

为什么选择xhs SDK？三大核心优势对比

特性	xhs SDK方案	传统爬虫方案	手动采集方案
开发效率	5分钟快速集成	2-3周开发周期	人工操作，效率低
稳定性	自动签名验证，成功率95%+	频繁失效，维护成本高	依赖人工，不稳定
功能完整性	完整API覆盖	功能碎片化	功能有限
学习曲线	简单Python接口	复杂逆向工程	无需技术
合规性	合理频率控制	高风险	合规但低效

方案一：三步搞定小红书笔记数据采集

1. 环境配置与快速安装

# Install the xhs SDK from PyPI
pip install xhs

# Or install the latest version from source
git clone https://gitcode.com/gh_mirrors/xh/xhs
cd xhs
pip install -e .

2. 核心客户端初始化

from xhs import XhsClient

# Basic initialization: only the cookie string is supplied.
cookie = "your_xhs_cookie_string"
xhs_client = XhsClient(cookie)

# Advanced configuration: proxy, timeout and a custom User-Agent.
# The proxy mapping is keyed by URL scheme, so both "http" and "https" are
# listed; with only "http" present, HTTPS requests would bypass the proxy.
xhs_client = XhsClient(
    cookie=cookie,
    proxies={
        "http": "http://proxy.example.com:8080",
        "https": "http://proxy.example.com:8080",
    },
    timeout=30,
    user_agent="Mozilla/5.0 Custom-UA",
)

3. 实战：批量采集热门笔记

import json
from datetime import datetime


def collect_popular_notes(keywords, limit=50, client=None):
    """Collect popular notes for each of the given keywords.

    Args:
        keywords: Iterable of search keywords.
        limit: Maximum number of search hits processed per keyword.
        client: Object exposing ``search(keyword=, sort=, note_type=)`` and
            ``get_note_by_id(note_id)``. Defaults to the module-level
            ``xhs_client`` so existing callers keep working.

    Returns:
        A list of dicts with the keys ``keyword``, ``note_id``, ``title``,
        ``author``, ``likes``, ``collects``, ``publish_time`` and ``tags``.
    """
    if client is None:
        # Fall back to the client created in the initialization snippet.
        client = xhs_client

    # NOTE(review): `search` and the response keys used below are taken as
    # written; confirm them against the installed SDK version.
    results = []
    for keyword in keywords:
        search_results = client.search(
            keyword=keyword,
            sort="popularity_descending",
            note_type="all",
        )
        for note in search_results['items'][:limit]:
            note_detail = client.get_note_by_id(note['id'])
            results.append({
                'keyword': keyword,
                'note_id': note_detail['id'],
                'title': note_detail['title'],
                'author': note_detail['user']['nickname'],
                'likes': note_detail['likes'],
                'collects': note_detail['collects'],
                # 'time' is divided by 1000 before conversion, i.e. it is
                # treated as a millisecond timestamp.
                'publish_time': datetime.fromtimestamp(note_detail['time'] / 1000),
                'tags': note_detail['tag_list'],
            })
    return results

方案二：智能内容分类与趋势分析系统

内容分类深度解析

xhs SDK内置了小红书完整的内容分类体系，通过FeedType枚举类支持12个主要内容领域：

from xhs import FeedType


def analyze_category_trends(client=None):
    """Fetch the food, fashion and travel home feeds and summarize each one.

    Args:
        client: Object exposing ``get_home_feed(feed_type=...)``. Defaults to
            the module-level ``xhs_client``.

    Returns:
        A dict mapping ``'food'``, ``'fashion'`` and ``'travel'`` to whatever
        ``extract_trends`` returns for the corresponding feed.
    """
    if client is None:
        client = xhs_client

    # NOTE(review): `FASION` is kept exactly as written; confirm the member
    # name against the installed SDK's FeedType enum before "correcting" it.
    categories = (
        ('food', FeedType.FOOD),
        ('fashion', FeedType.FASION),
        ('travel', FeedType.TRAVEL),
    )

    trends_data = {}
    for label, feed_type in categories:
        feed = client.get_home_feed(feed_type=feed_type)
        # NOTE(review): `extract_trends` is not defined in this snippet; it
        # must be provided elsewhere by the caller's module.
        trends_data[label] = extract_trends(feed)
    return trends_data

趋势分析实战案例

import re
from collections import Counter
from datetime import datetime
from itertools import chain

import pandas as pd


class ContentTrendAnalyzer:
    """Accumulate home-feed notes per category and summarize engagement."""

    def __init__(self, xhs_client):
        self.client = xhs_client
        # One dict per collected note; see collect_daily_trends for the keys.
        self.trend_data = []

    def collect_daily_trends(self, category, days=7):
        """Fetch the category feed ``days`` times and return the analysis.

        Args:
            category: Name of a ``FeedType`` member, case-insensitive.
            days: Number of feed fetches to perform.

        Returns:
            The dict produced by :meth:`analyze_trends`.

        Raises:
            AttributeError: If ``category`` is not a ``FeedType`` member.
        """
        # Imported lazily so the class can be defined without the SDK.
        from xhs import FeedType

        feed_type = getattr(FeedType, category.upper())

        # NOTE(review): every iteration is stamped with today's date, so a
        # single call does not yield multi-day history; call this once per
        # day to build a real time series.
        for _ in range(days):
            feed_data = self.client.get_home_feed(feed_type=feed_type)
            for note in feed_data['items'][:100]:  # first 100 notes only
                self.trend_data.append({
                    'date': datetime.now().date(),
                    'category': category,
                    'title': note['title'],
                    'engagement': note['likes'] + note['collects'],
                    'keywords': self.extract_keywords(note['title']),
                })
        return self.analyze_trends()

    def extract_keywords(self, title):
        """Split a title into word-like tokens.

        This is a minimal baseline (runs of Unicode word characters); replace
        it with a proper tokenizer for meaningful Chinese keyword extraction.
        """
        return re.findall(r"\w+", title)

    def analyze_trends(self):
        """Summarize the collected notes.

        Returns:
            A dict with ``daily_trends`` (mean engagement per date/category),
            ``top_keywords`` (the 20 most common keywords with counts) and
            ``category_comparison`` (mean engagement per category). All three
            are empty when nothing has been collected yet.
        """
        if not self.trend_data:
            # An empty DataFrame has no columns to group by.
            return {
                'daily_trends': [],
                'top_keywords': [],
                'category_comparison': {},
            }

        df = pd.DataFrame(self.trend_data)

        # Mean engagement per (date, category).
        daily_stats = (
            df.groupby(['date', 'category'])
            .agg({'engagement': 'mean'})
            .reset_index()
        )

        # Most frequent keywords across all collected titles.
        keyword_counts = Counter(chain.from_iterable(df['keywords']))

        return {
            'daily_trends': daily_stats.to_dict('records'),
            'top_keywords': keyword_counts.most_common(20),
            'category_comparison': df.groupby('category')['engagement'].mean().to_dict(),
        }

方案三：企业级数据采集架构设计

高性能异步采集架构

import asyncio
import logging
from concurrent.futures import ThreadPoolExecutor

import aiohttp

logger = logging.getLogger(__name__)


class AsyncXhsCollector:
    """Fetch many notes concurrently with a bounded number of in-flight calls."""

    def __init__(self, cookie, max_concurrent=10):
        self.cookie = cookie
        self.max_concurrent = max_concurrent
        self.session = None
        # Synchronous SDK client, created on first use by _async_get_note.
        self._client = None

    async def collect_notes_batch(self, note_ids):
        """Fetch all ``note_ids`` concurrently.

        Args:
            note_ids: Iterable of note ids.

        Returns:
            The successfully fetched notes, in input order. Failed fetches
            are logged as warnings and left out of the result.
        """
        note_ids = list(note_ids)  # allow generators; needed for the zip below
        semaphore = asyncio.Semaphore(self.max_concurrent)

        async with aiohttp.ClientSession() as session:
            tasks = [
                asyncio.create_task(
                    self.fetch_note_with_semaphore(session, note_id, semaphore)
                )
                for note_id in note_ids
            ]
            results = await asyncio.gather(*tasks, return_exceptions=True)

        collected = []
        for note_id, result in zip(note_ids, results):
            if isinstance(result, Exception):
                # Report the failure instead of dropping it silently.
                logger.warning("Failed to fetch note %s: %r", note_id, result)
            else:
                collected.append(result)
        return collected

    async def fetch_note_with_semaphore(self, session, note_id, semaphore):
        """Fetch one note while holding a semaphore slot.

        ``session`` is accepted for interface compatibility; the current
        implementation delegates to the synchronous SDK client instead.
        """
        async with semaphore:
            await asyncio.sleep(0.1)  # throttle the request rate
            return await self._async_get_note(note_id)

    async def _async_get_note(self, note_id):
        """Run the blocking ``get_note_by_id`` call in the default executor."""
        if self._client is None:
            # Imported lazily so the class can be defined without the SDK.
            from xhs import XhsClient

            self._client = XhsClient(self.cookie)

        # NOTE(review): one client instance is shared across worker threads;
        # confirm the SDK client is safe for concurrent use, or create one
        # client per call instead.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self._client.get_note_by_id, note_id
        )

数据存储与缓存策略

import hashlib
import json
import sqlite3
from contextlib import closing
from datetime import datetime, timedelta


class XhsDataCache:
    """SQLite-backed cache of note payloads with a per-row expiry time."""

    def __init__(self, db_path="xhs_cache.db"):
        self.db_path = db_path
        self.init_cache_db()

    def init_cache_db(self):
        """Create the ``note_cache`` table and its indexes if they are missing."""
        # `closing` guarantees the connection is closed; the inner `conn`
        # context commits on success and rolls back on error.
        with closing(sqlite3.connect(self.db_path)) as conn, conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS note_cache (
                    note_id TEXT PRIMARY KEY,
                    data TEXT NOT NULL,
                    category TEXT,
                    fetch_time TIMESTAMP,
                    expire_time TIMESTAMP,
                    hash_key TEXT
                )
            ''')
            conn.execute('''
                CREATE INDEX IF NOT EXISTS idx_category
                ON note_cache(category)
            ''')
            conn.execute('''
                CREATE INDEX IF NOT EXISTS idx_expire
                ON note_cache(expire_time)
            ''')

    def cache_note(self, note_data, category=None, ttl_hours=24):
        """Insert or replace the cached copy of a note.

        Args:
            note_data: JSON-serializable dict containing at least ``'id'``.
            category: Optional category label stored alongside the note.
            ttl_hours: Hours until the row's ``expire_time``.

        Raises:
            KeyError: If ``note_data`` has no ``'id'`` key.
        """
        note_id = note_data['id']
        data_json = json.dumps(note_data, ensure_ascii=False)
        # MD5 is only a content fingerprint here, not a security measure.
        hash_key = hashlib.md5(data_json.encode()).hexdigest()

        # Take "now" once so fetch_time and expire_time differ by the TTL.
        fetched_at = datetime.now()
        expires_at = fetched_at + timedelta(hours=ttl_hours)

        with closing(sqlite3.connect(self.db_path)) as conn, conn:
            conn.execute(
                '''
                INSERT OR REPLACE INTO note_cache
                (note_id, data, category, fetch_time, expire_time, hash_key)
                VALUES (?, ?, ?, ?, ?, ?)
                ''',
                (
                    note_id,
                    data_json,
                    category,
                    # Stored as ISO text explicitly: the implicit datetime
                    # adapter is deprecated since Python 3.12.
                    fetched_at.isoformat(sep=' '),
                    expires_at.isoformat(sep=' '),
                    hash_key,
                ),
            )

避坑指南：小红书数据采集的5个关键点

1. 签名验证的智能处理

xhs SDK已经内置了签名验证机制，但在实际使用中需要注意：

import time


# Signature configuration
def create_robust_client(sign=None, cookie="your_cookie"):
    """Create an SDK client with a timeout and proxies for both schemes.

    Args:
        sign: Signing callable forwarded to ``XhsClient``.
            NOTE(review): no ``sign`` function is defined in this snippet;
            supply the one your setup uses.
        cookie: Cookie string for the account.

    Returns:
        The configured ``XhsClient`` instance.
    """
    # Imported lazily so this module can be loaded without the SDK.
    from xhs import XhsClient

    return XhsClient(
        cookie=cookie,
        sign=sign,
        timeout=30,
        proxies={
            'http': 'http://proxy:8080',
            'https': 'http://proxy:8080',
        },
    )


# Error-handling strategy
def safe_fetch_with_retry(note_id, max_retries=3, client=None, base_delay=1):
    """Fetch a note, retrying with exponential backoff on failure.

    Args:
        note_id: Id of the note to fetch.
        max_retries: Total number of attempts; must be at least 1.
        client: Object exposing ``get_note_by_id``. Defaults to the
            module-level ``xhs_client``.
        base_delay: Seconds waited after the first failure; the wait doubles
            after each further failure.

    Returns:
        Whatever ``client.get_note_by_id(note_id)`` returns.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        Exception: The error from the final attempt is re-raised unchanged.
    """
    if max_retries < 1:
        # Otherwise the loop would not run and None would be returned.
        raise ValueError("max_retries must be at least 1")
    if client is None:
        client = xhs_client

    for attempt in range(max_retries):
        try:
            return client.get_note_by_id(note_id)
        except Exception:
            if attempt == max_retries - 1:
                raise
            wait_time = base_delay * 2 ** attempt  # exponential backoff
            print(f"第{attempt+1}次尝试失败，等待{wait_time}秒后重试")
            time.sleep(wait_time)

2. 请求频率的合理控制

操作类型	建议频率	并发数	注意事项
单笔记查询	1-2次/秒	1	避免高频查询同一笔记
批量采集	5-10次/分钟	3-5	添加随机延迟
搜索操作	2-3次/秒	2	控制关键词数量
分类浏览	1次/3秒	1	按需获取，避免全量

3. Cookie管理的实战经验

import json
import os


class CookieManager:
    """Load a list of cookies from a JSON file and hand them out round-robin."""

    def __init__(self, cookie_file="cookies.json"):
        self.cookie_file = cookie_file
        self.cookies = self.load_cookies()

    def load_cookies(self):
        """Return the cookies stored in ``cookie_file``, or ``[]`` if it is missing."""
        if os.path.exists(self.cookie_file):
            with open(self.cookie_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        return []

    def rotate_cookie(self):
        """Return the next cookie and move it to the back of the queue.

        Raises:
            ValueError: If no cookies are loaded.
        """
        if not self.cookies:
            raise ValueError("没有可用的Cookie")
        # Simple round-robin: take the head, re-queue it at the tail.
        current = self.cookies.pop(0)
        self.cookies.append(current)
        return current

    def validate_cookie(self, cookie):
        """Return True if a probe request made with ``cookie`` succeeds.

        Any error raised while creating the client or fetching the probe
        note is treated as "invalid" and reported as False.
        """
        # Imported lazily so the class can be defined without the SDK.
        from xhs import XhsClient

        try:
            test_client = XhsClient(cookie, timeout=5)
            # NOTE(review): "test_note_id" is a placeholder id; a valid
            # cookie may still fail this probe. Use a known public note id.
            test_result = test_client.get_note_by_id("test_note_id")
            return test_result is not None
        except Exception:
            # Not a bare `except:` so Ctrl-C and SystemExit still propagate.
            return False

扩展生态：与其他工具的集成方案

1. 与Pandas的数据分析集成

import numpy as np
import pandas as pd


class XhsDataFrameBuilder:
    """Build pandas DataFrames summarizing users and their notes."""

    def __init__(self, xhs_client):
        self.client = xhs_client

    def build_user_analysis_df(self, user_ids):
        """Return one row of profile and engagement statistics per user.

        Args:
            user_ids: Iterable of user ids.

        Returns:
            A DataFrame with the columns ``user_id``, ``nickname``,
            ``fans_count``, ``notes_count``, ``avg_likes``, ``avg_collects``
            and ``engagement_rate``. The averages are 0.0 for users with no
            notes.
        """
        user_data = []
        for user_id in user_ids:
            user_info = self.client.get_user_info(user_id)
            # NOTE(review): treated as a sequence of note dicts with 'likes'
            # and 'collects' keys; confirm the SDK's actual return shape.
            user_notes = self.client.get_user_notes(user_id)

            user_data.append({
                'user_id': user_id,
                'nickname': user_info['nickname'],
                'fans_count': user_info['fans_count'],
                'notes_count': user_info['notes_count'],
                'avg_likes': self._mean_of(user_notes, 'likes'),
                'avg_collects': self._mean_of(user_notes, 'collects'),
                'engagement_rate': self.calculate_engagement_rate(user_notes),
            })
        return pd.DataFrame(user_data)

    @staticmethod
    def _mean_of(notes, field):
        """Mean of ``field`` over ``notes``; 0.0 for no notes instead of NaN."""
        values = [note[field] for note in notes]
        return float(np.mean(values)) if values else 0.0

    def calculate_engagement_rate(self, notes):
        """Return (likes + collects) / views over all notes.

        Returns 0 when there are no notes or no recorded views.
        """
        if not notes:
            return 0
        total_interactions = sum(n['likes'] + n['collects'] for n in notes)
        total_views = sum(n.get('views', 0) for n in notes)
        return total_interactions / total_views if total_views > 0 else 0

2. 与Elasticsearch的搜索集成

from datetime import datetime

from elasticsearch import Elasticsearch


class XhsElasticsearchIndexer:
    """Index note documents into the ``xhs_notes`` Elasticsearch index."""

    def __init__(self, es_host="localhost:9200"):
        # NOTE(review): recent Elasticsearch clients may require a full URL
        # with scheme (e.g. "http://localhost:9200"); confirm for your version.
        self.es = Elasticsearch([es_host])
        self.index_name = "xhs_notes"

    def create_index_mapping(self):
        """Create the index with its field mapping unless it already exists."""
        # The "ik_max_word" analyzer is referenced by name; it must be
        # available on the cluster for index creation to succeed.
        mapping = {
            "mappings": {
                "properties": {
                    "note_id": {"type": "keyword"},
                    "title": {"type": "text", "analyzer": "ik_max_word"},
                    "content": {"type": "text", "analyzer": "ik_max_word"},
                    "author": {"type": "keyword"},
                    "likes": {"type": "integer"},
                    "collects": {"type": "integer"},
                    "publish_time": {"type": "date"},
                    "tags": {"type": "keyword"},
                    "category": {"type": "keyword"},
                }
            }
        }

        if not self.es.indices.exists(index=self.index_name):
            self.es.indices.create(index=self.index_name, body=mapping)

    def detect_category(self, note_data):
        """Return the category to store for a note.

        Placeholder hook: it returns the note's own ``'category'`` value, or
        None when absent. Override it with real classification logic.
        """
        return note_data.get('category')

    def index_note(self, note_data):
        """Index one note, using the note id as the document id.

        Args:
            note_data: Dict with ``id``, ``title``, ``user.nickname``,
                ``likes``, ``collects`` and ``time``; ``desc`` and
                ``tag_list`` are optional.
        """
        doc = {
            'note_id': note_data['id'],
            'title': note_data['title'],
            'content': note_data.get('desc', ''),
            'author': note_data['user']['nickname'],
            'likes': note_data['likes'],
            'collects': note_data['collects'],
            # 'time' is divided by 1000, i.e. treated as milliseconds.
            'publish_time': datetime.fromtimestamp(note_data['time'] / 1000),
            'tags': [tag['name'] for tag in note_data.get('tag_list', [])],
            'category': self.detect_category(note_data),
        }

        self.es.index(index=self.index_name, id=note_data['id'], body=doc)

性能优化实战：从基础到高级

连接池与会话复用

import time

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


class OptimizedXhsClient:
    """HTTP session with connection pooling, retries and batch fetching."""

    def __init__(self, cookie, max_pool_size=10):
        self.cookie = cookie
        self.session = self.create_optimized_session(max_pool_size)

    def create_optimized_session(self, max_pool_size):
        """Build a ``requests.Session`` with pooling, retries and headers.

        Args:
            max_pool_size: Used for both the number of pools and pool size.

        Returns:
            The configured session.
        """
        session = requests.Session()

        # Connection pool plus automatic retries on transient 5xx responses.
        adapter = HTTPAdapter(
            pool_connections=max_pool_size,
            pool_maxsize=max_pool_size,
            max_retries=Retry(
                total=3,
                backoff_factor=0.5,
                status_forcelist=[500, 502, 503, 504],
            ),
        )
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        # Headers sent with every request.
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'application/json',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        })
        # Attach the account cookie; without it the session sends no
        # credentials at all.
        if self.cookie:
            session.headers['Cookie'] = self.cookie

        return session

    def get_note_by_id(self, note_id):
        """Fetch a single note; must be implemented before batch fetching.

        Raises:
            NotImplementedError: Always, until a subclass provides the
                request and signing logic.
        """
        raise NotImplementedError(
            "get_note_by_id must be implemented (e.g. by delegating to XhsClient)"
        )

    def batch_fetch_notes(self, note_ids, batch_size=20, batch_delay=1):
        """Fetch notes in batches, pausing between batches.

        Args:
            note_ids: Sequence of note ids (must support ``len`` and slicing).
            batch_size: Number of notes per batch.
            batch_delay: Seconds to sleep between consecutive batches.

        Returns:
            A list aligned with ``note_ids``; entries are the fetched note or
            None where the fetch failed.

        Raises:
            NotImplementedError: If ``get_note_by_id`` is not implemented;
                this is not converted into None results.
        """
        results = []
        total = len(note_ids)
        for start in range(0, total, batch_size):
            for note_id in note_ids[start:start + batch_size]:
                try:
                    results.append(self.get_note_by_id(note_id))
                except NotImplementedError:
                    # A missing implementation is a programming error, not a
                    # per-note failure; surface it.
                    raise
                except Exception as e:
                    print(f"获取笔记{note_id}失败: {e}")
                    results.append(None)

            # Delay between batches only; no need to wait after the last one.
            if start + batch_size < total:
                time.sleep(batch_delay)

        return results

内存优化策略

import gc
import json
import tracemalloc
from dataclasses import asdict, dataclass
from datetime import datetime
from typing import Optional


@dataclass
class NoteSummary:
    """Lightweight note summary that keeps only a few fields per note."""

    note_id: str
    title: str
    author: str
    likes: int
    collects: int
    publish_date: str
    tags: tuple

    @classmethod
    def from_full_note(cls, full_note):
        """Build a summary from a full note dict.

        The title is cut to 100 characters and at most five tag names are
        kept. ``publish_date`` is formatted as ``YYYY-MM-DD`` in local time.
        """
        published = datetime.fromtimestamp(full_note['time'] / 1000)
        return cls(
            note_id=full_note['id'],
            title=full_note['title'][:100],  # cap the title length
            author=full_note['user']['nickname'],
            likes=full_note['likes'],
            collects=full_note['collects'],
            publish_date=published.strftime('%Y-%m-%d'),
            tags=tuple(tag['name'] for tag in full_note.get('tag_list', [])[:5]),
        )


class MemoryEfficientCollector:
    """Collect note summaries, spilling them to disk when memory runs high."""

    def __init__(self, xhs_client, spill_path="xhs_notes.jsonl"):
        self.client = xhs_client
        self.collected_notes = []
        # JSON-lines file that save_and_clear appends to.
        self.spill_path = spill_path

    def collect_with_memory_control(self, note_ids, max_memory_mb=100):
        """Fetch and summarize notes, spilling to disk above the threshold.

        Args:
            note_ids: Iterable of note ids.
            max_memory_mb: Traced-memory threshold, in MiB, above which the
                summaries collected so far are written out and dropped.

        Returns:
            The summaries still held in memory. Summaries spilled earlier are
            in ``spill_path`` and are not part of the returned list.
        """
        for note_id in note_ids:
            note_data = self.client.get_note_by_id(note_id)
            self.collected_notes.append(NoteSummary.from_full_note(note_data))

            # Run the garbage collector every 100 collected notes.
            if len(self.collected_notes) % 100 == 0:
                self.cleanup_memory()

            if self.get_memory_usage_mb() > max_memory_mb:
                self.save_and_clear()

        return self.collected_notes

    def get_memory_usage_mb(self):
        """Return the memory currently traced by ``tracemalloc``, in MiB.

        Tracing is started on first use, so only Python allocations made
        after that point are counted.
        """
        if not tracemalloc.is_tracing():
            tracemalloc.start()
        current, _peak = tracemalloc.get_traced_memory()
        return current / (1024 * 1024)

    def save_and_clear(self):
        """Append the held summaries to ``spill_path`` and drop them."""
        if self.collected_notes:
            with open(self.spill_path, 'a', encoding='utf-8') as f:
                f.writelines(
                    json.dumps(asdict(summary), ensure_ascii=False) + "\n"
                    for summary in self.collected_notes
                )
        # Rebind rather than clear() so a list already returned to a caller
        # is not emptied behind its back.
        self.collected_notes = []
        self.cleanup_memory()

    def cleanup_memory(self):
        """Force a garbage-collection pass."""
        gc.collect()

快速验证：立即体验xhs SDK的强大功能

1. 一分钟快速启动

# Minimal verification script
from xhs import XhsClient

# Initialize the client (a valid cookie is required).
cookie = "your_xhs_cookie_here"
client = XhsClient(cookie)

# Exercise the basic note lookup.
test_note_id = "6505318c000000001f03c5a6"  # sample note id
note = client.get_note_by_id(test_note_id)

# NOTE(review): the keys read below are taken as written; confirm them
# against the payload returned by the installed SDK version.
print(f"笔记标题: {note['title']}")
print(f"作者: {note['user']['nickname']}")
print(f"点赞数: {note['likes']}")
print(f"收藏数: {note['collects']}")

2. 功能验证清单

功能模块	验证方法	预期结果
笔记获取	get_note_by_id()	返回完整的笔记数据
搜索功能	search(keyword="测试")	返回相关搜索结果
分类浏览	get_home_feed(feed_type)	返回分类内容列表
用户信息	get_user_info(user_id)	返回用户基本信息
图片提取	get_imgs_url_from_note()	返回图片URL列表

3. 性能基准测试

import time
from statistics import mean


def benchmark_xhs_performance(operations=None, runs=5):
    """Time a set of operations and report avg/min/max per operation.

    Args:
        operations: Iterable of ``(name, zero-argument callable)`` pairs.
            Defaults to three SDK calls made through the module-level
            ``client`` and ``test_note_id``.
        runs: Number of timed executions per operation; must be at least 1.

    Returns:
        A dict mapping each operation name to a dict with ``avg_time``,
        ``min_time`` and ``max_time``, all in seconds.

    Raises:
        ValueError: If ``runs`` is less than 1.
    """
    if runs < 1:
        raise ValueError("runs must be at least 1")

    if operations is None:
        # Imported lazily so custom operations can be timed without the SDK.
        from xhs import FeedType

        # NOTE(review): `search(..., limit=10)` and `FeedType.RECOMMEND` are
        # assumed to exist in the installed SDK; confirm before relying on
        # the default operation set.
        operations = [
            ("单笔记查询", lambda: client.get_note_by_id(test_note_id)),
            ("搜索操作", lambda: client.search(keyword="Python", limit=10)),
            ("分类获取", lambda: client.get_home_feed(feed_type=FeedType.RECOMMEND)),
        ]

    results = {}
    for op_name, op_func in operations:
        times = []
        for _ in range(runs):
            # perf_counter is monotonic and high-resolution, unlike
            # time.time, which can jump when the system clock is adjusted.
            start = time.perf_counter()
            op_func()
            times.append(time.perf_counter() - start)

        results[op_name] = {
            'avg_time': mean(times),
            'min_time': min(times),
            'max_time': max(times),
        }

    return results