# coding: utf-8
import hashlib
import os
import time
import requests
from requests.cookies import cookielib
# TODO: 抽象一个基类
class PublicAccountsWeb(object):
    """Fetch article links or account information through the web version
    of the WeChat Official Accounts Platform (mp.weixin.qq.com).

    The instance keeps one ``requests`` session plus two mutable dicts,
    ``self.headers`` and ``self.params``, that are reused (and updated in
    place) by every request the instance makes.
    """

    def __init__(self, cookie, token, proxies={"http": None, "https": None}):
        """
        Parameters
        ----------
        cookie : str
            Cookie string obtained after logging in to the platform.
        token : str
            Token obtained after logging in to the platform.
        proxies : dict
            Forwarded as the ``proxies`` argument of the requests calls.

        Raises
        ------
        TypeError
            If ``cookie`` or ``token`` is not a ``str``.
        """
        self.s = requests.session()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36"
        }
        self.params = {
            "lang": "zh_CN",
            "f": "json",
        }

        # Credentials are supplied manually instead of going through the
        # QR-code login flow implemented by the private helpers below.
        self.__verify_str(cookie, "cookie")
        self.__verify_str(token, "token")
        self.headers["Cookie"] = cookie
        self.params["token"] = token

        # Copy the mapping so instances never share (and accidentally
        # mutate) the default-argument dict.
        self.proxies = dict(proxies) if proxies is not None else None

    def __verify_str(self, input_string, param_name):
        """
        Check that a value is a string.

        Parameters
        ----------
        input_string : object
            Value to validate.
        param_name : str
            Name of the parameter, used in the error message.

        Raises
        ------
        TypeError
            If ``input_string`` is not an instance of ``str``.
        """
        if not isinstance(input_string, str):
            raise TypeError("{} must be an instance of str".format(param_name))

    def __save_login_qrcode(self, img):
        """
        Save the login QR code to ``login.png`` and display it.

        Parameters
        ----------
        img : response object
            Response whose ``content`` attribute holds the QR-code image
            bytes.

        Raises
        ------
        TypeError
            If the saved file cannot be opened as an image. The message
            reports a wrong account or password.
        """
        import matplotlib.pyplot as plt
        from PIL import Image

        # Persist the QR code.
        with open("login.png", "wb") as fp:
            fp.write(img.content)

        try:
            qrcode_image = Image.open("login.png")
        except Exception:
            raise TypeError(u"账号密码输入错误,请重新输入")

        # matplotlib is used on purpose: plt.show() blocks until the user
        # closes the window, which gives them time to scan the code.
        plt.figure()
        plt.imshow(qrcode_image)
        plt.show()

    def __save_cookie(self, username):
        """
        Save the session cookies to ``cookies/<username>.txt``.

        Parameters
        ----------
        username : str
            Account name, used to build the file name.
        """
        cookie_path = "cookies/" + username + ".txt"
        new_cookie_jar = cookielib.LWPCookieJar(username + ".txt")
        # Copy the session cookies into the LWP jar via a plain dict.
        requests.utils.cookiejar_from_dict(
            {c.name: c.value for c in self.s.cookies}, new_cookie_jar
        )
        # The target directory may not exist yet on a first run.
        if not os.path.isdir("cookies"):
            os.makedirs("cookies")
        new_cookie_jar.save(cookie_path, ignore_discard=True, ignore_expires=True)

    def __read_cookie(self, username):
        """
        Load cookies from ``cookies/<username>.txt`` into the session.

        Parameters
        ----------
        username : str
            Account name, used to build the file name.
        """
        load_cookiejar = cookielib.LWPCookieJar()
        # The file is expected to be in LWP format (see __save_cookie).
        load_cookiejar.load(
            "cookies/" + username + ".txt", ignore_discard=True, ignore_expires=True
        )
        # Round-trip through a dict to obtain a RequestsCookieJar.
        load_cookies = requests.utils.dict_from_cookiejar(load_cookiejar)
        self.s.cookies = requests.utils.cookiejar_from_dict(load_cookies)

    def __md5_passwd(self, password):
        """
        Return the hex MD5 digest of a password.

        The login form sends the password as an MD5 hex digest.

        Parameters
        ----------
        password : str
            Plain-text password.

        Returns
        -------
        str
            Hex digest of the UTF-8 encoded password.
        """
        return hashlib.md5(password.encode("utf-8")).hexdigest()

    def __startlogin_official(self, username, password):
        """
        Post the credentials, fetch and show the login QR code, then
        continue with ``__login_official`` to obtain the token.

        Parameters
        ----------
        username : str
            Account name.
        password : str
            Account password (sent as an MD5 hex digest).

        Returns
        -------
        None
        """
        data = {
            "username": username,
            "userlang": "zh_CN",
            "token": "",
            "pwd": self.__md5_passwd(password),
            "lang": "zh_CN",
            "imgcode": "",
            "f": "json",
            "ajax": "1",
        }

        # Extra headers for the login request.
        self.headers["Host"] = "mp.weixin.qq.com"
        self.headers["Origin"] = "https://mp.weixin.qq.com"
        self.headers["Referer"] = "https://mp.weixin.qq.com/"

        # URL the credentials are posted to.
        bizlogin_url = "https://mp.weixin.qq.com/cgi-bin/bizlogin?action=startlogin"
        # URL of the QR code. "&param" had been garbled into a pilcrow
        # character, which broke the query string.
        qrcode_url = "https://mp.weixin.qq.com/cgi-bin/loginqrcode?action=getqrcode&param=4300&rd=928"

        # Log in, fetch the QR code and wait until the user has scanned it
        # and closed the window. Proxies are passed like in every other
        # request made by this class.
        self.s.post(
            bizlogin_url, headers=self.headers, data=data, proxies=self.proxies
        )
        img = self.s.get(qrcode_url, proxies=self.proxies)
        self.__save_login_qrcode(img)

        # These headers are not needed after this step.
        self.headers.pop("Host")
        self.headers.pop("Origin")

        # Obtain the token.
        self.__login_official(username, password)

    def __login_official(self, username, password):
        """
        Finish the login and store the token in ``self.params``.

        On failure the whole QR-code login is restarted.
        NOTE(review): the retry is unbounded (mutual recursion with
        ``__startlogin_official``).

        Parameters
        ----------
        username : str
            Account name.
        password : str
            Account password.
        """
        self.headers["Referer"] = "https://mp.weixin.qq.com/cgi-bin/bizlogin?action=validate&lang=zh_CN&account={}".format(
            username
        )

        data = {
            "userlang": "zh_CN",
            "token": "",
            "lang": "zh_CN",
            "f": "json",
            "ajax": "1",
        }

        bizlogin_url = "https://mp.weixin.qq.com/cgi-bin/bizlogin?action=login"
        res = self.s.post(
            bizlogin_url, data=data, headers=self.headers, proxies=self.proxies
        ).json()

        try:
            # The token is the value after the last "=" of redirect_url.
            token = res["redirect_url"].split("=")[-1]
            self.params["token"] = token
            # self.__save_cookie(username)
            self.headers.pop("Referer")
        except Exception:
            # No token: ask the user to scan the QR code again.
            print("please try again")
            self.__startlogin_official(username, password)

    def official_info(self, nickname, begin=0, count=5):
        """
        Search official accounts by keyword.

        Parameters
        ----------
        nickname : str
            Name of the official account to look for.
        begin : str or int
            Start offset of the search results.
        count : str or int
            Number of results per request, 1-5.

        Returns
        -------
        list
            The ``"list"`` entry of the JSON response. According to the
            original documentation each element looks like::

                {
                    'alias': account alias,
                    'fakeid': unique id of the account,
                    'nickname': account name,
                    'round_head_img': URL of the account avatar,
                    'service_type': kind of account, e.g. 1
                }

        Raises
        ------
        TypeError
            If ``nickname`` is not a ``str``.
        Exception
            If the request fails or the response has no ``"list"`` entry.
        """
        self.__verify_str(nickname, "nickname")

        search_url = "https://mp.weixin.qq.com/cgi-bin/searchbiz"

        # The shared params dict is updated in place, so these keys stay
        # set for later requests.
        self.params.update(
            {
                "query": nickname,
                "count": str(count),
                "action": "search_biz",
                "ajax": "1",
                "begin": str(begin),
            }
        )

        try:
            official = self.s.get(
                search_url,
                headers=self.headers,
                params=self.params,
                proxies=self.proxies,
            )
            return official.json()["list"]
        except Exception:
            raise Exception(u"公众号名称错误或cookie、token错误,请重新输入")

    def articles_nums(self, nickname):
        """
        Return the total number of articles published by an account.

        Parameters
        ----------
        nickname : str
            Name of the official account.

        Returns
        -------
        int
            The ``"app_msg_cnt"`` entry of the article-list response.

        Raises
        ------
        TypeError
            If ``nickname`` is not a ``str``.
        Exception
            If the lookup fails or the response lacks ``"app_msg_cnt"``.
        """
        self.__verify_str(nickname, "nickname")
        try:
            return self.__get_articles_data(nickname, begin="0")["app_msg_cnt"]
        except Exception:
            raise Exception(u"公众号名称错误或cookie、token错误,请重新输入")

    def get_urls(self, nickname, begin=0, count=5):
        """
        Return one page of article information for an account.

        Parameters
        ----------
        nickname : str
            Name of the official account.
        begin : str or int
            Start offset.
        count : str or int
            Number of articles per request, 1-5.

        Returns
        -------
        list
            The ``"app_msg_list"`` entry of the response. According to the
            original documentation each element looks like::

                {
                    'aid': '2650949647_1',
                    'appmsgid': 2650949647,
                    'cover': cover image URL,
                    'digest': article summary,
                    'itemidx': 1,
                    'link': article URL,
                    'title': article title,
                    'update_time': timestamp of the last update
                }

            An empty list means there are no matching articles.

        Raises
        ------
        TypeError
            If ``nickname`` is not a ``str``.
        Exception
            If the lookup fails or the response lacks ``"app_msg_list"``.
        """
        self.__verify_str(nickname, "nickname")
        try:
            return self.__get_articles_data(
                nickname, begin=str(begin), count=str(count)
            )["app_msg_list"]
        except Exception:
            raise Exception(u"公众号名称错误或cookie、token错误,请重新输入")

    def latest_articles(self, biz):
        """
        Return the first page of article information for an account,
        addressed by its biz id instead of its name.

        Parameters
        ----------
        biz : str
            biz id of the official account (sent as ``fakeid``).

        Returns
        -------
        list
            The ``"app_msg_list"`` entry of the response; same element
            layout as documented for ``get_urls``. An empty list means
            there are no matching articles.

        Raises
        ------
        Exception
            If the request fails or the response lacks ``"app_msg_list"``.
        """
        try:
            return self.__get_articles_data("", begin="0", biz=biz)["app_msg_list"]
        except Exception:
            raise Exception(u"公众号名称错误或cookie、token错误,请重新输入")

    def __get_articles_data(
        self,
        nickname,
        begin,
        biz=None,
        count=5,
        type_="9",
        action="list_ex",
        query=None,
    ):
        """
        Request the article list of an account and return the parsed JSON.

        Parameters
        ----------
        nickname : str
            Name of the official account; ``""`` to address it by ``biz``.
        begin : str or int
            Start offset.
        biz : str
            biz id of the account, used when ``nickname`` is empty.
        count : str or int
            Number of articles per request, 1-5.
        type_ : str or int
            Sent as the ``type`` request parameter (purpose unknown to the
            original author).
        action : str
            Sent as the ``action`` request parameter; ``"list_ex"``
            returns the article information as JSON.
        query : str
            Sent as the ``query`` request parameter; ``None`` sends ``""``.

        Returns
        -------
        dict
            Parsed JSON response. According to the original documentation::

                {
                    'app_msg_cnt': total number of published articles,
                    'app_msg_list': list of articles (see get_urls),
                    'base_resp': {'err_msg': 'ok', 'ret': 0}
                }

        Raises
        ------
        Exception
            If neither ``nickname`` nor ``biz`` is given, or if the account
            lookup by name fails.
        """
        appmsg_url = "https://mp.weixin.qq.com/cgi-bin/appmsg"

        # Validate outside the try block; previously this specific message
        # was swallowed and replaced by the generic one below.
        if nickname == "" and biz is None:
            raise Exception(u"请输入biz或者nickname")

        if nickname != "":
            try:
                # Resolve the account name to its fakeid (first search hit).
                official_info = self.official_info(nickname)
                self.params["fakeid"] = official_info[0]["fakeid"]
            except Exception:
                raise Exception(u"公众号名称错误或cookie、token错误,请重新输入")
        else:
            self.params["fakeid"] = biz

        # Add or overwrite the request parameters on the shared dict.
        self.params.update(
            {
                "query": query if query is not None else "",
                "begin": str(begin),
                "count": str(count),
                "type": str(type_),
                "action": action,
            }
        )

        data = self.s.get(
            appmsg_url, headers=self.headers, params=self.params, proxies=self.proxies
        )
        return data.json()
class PC(object):
    """Fetch article links of a WeChat official account through the
    interface used by the PC WeChat client."""

    def __init__(self, biz, uin, cookie, proxies={"http": None, "https": None}):
        """
        Parameters
        ----------
        biz : str
            Id of the official account to crawl (sent as ``__biz``).
        uin : str
            User id.
        cookie : str
            Cookie obtained after logging in to WeChat.
        proxies : dict
            Forwarded as the ``proxies`` argument of the requests calls.
        """
        self.s = requests.session()
        self.__biz = biz
        self.uin = uin
        # NOTE(review): the header name is "Cookies", while the Mobile
        # class sends "Cookie" -- confirm which one the server expects.
        self.headers = {"Cookies": cookie}
        # Copy the mapping so instances never share (and accidentally
        # mutate) the default-argument dict.
        self.proxies = dict(proxies) if proxies is not None else None

    def get_urls(self, key, offset="0"):
        """
        Return one page of article information.

        Parameters
        ----------
        key : str
            Key obtained after logging in with a personal WeChat account.
        offset : str or int
            Start offset, beginning at 0. The original author recommends
            increasing it by 10 per request and de-duplicating afterwards.

        Returns
        -------
        list
            The items of ``general_msg_list`` that contain an
            ``'app_msg_ext_info'`` key. The fields of main interest are
            ``item['app_msg_ext_info']['content_url']``,
            ``item['app_msg_ext_info']['title']`` and
            ``item['comm_msg_info']['datetime']``.

            According to the original documentation the URLs still
            contain HTML entities and backslash-escaped slashes; apply
            ``html.unescape`` (twice) and strip the backslashes before
            use. Example item::

                {
                    'app_msg_ext_info': {
                        'audio_fileid': 0,
                        'author': '',
                        'content': '',
                        'content_url': article URL (escaped),
                        'copyright_stat': 100,
                        'cover': cover image URL (escaped),
                        'del_flag': 1,
                        'digest': '',
                        'duration': 0,
                        'fileid': 0,
                        'is_multi': 0,
                        'item_show_type': 8,
                        'malicious_content_type': 0,
                        'malicious_title_reason_id': 0,
                        'multi_app_msg_item_list': [],
                        'play_url': '',
                        'source_url': '',
                        'subtype': 9,
                        'title': article title
                    },
                    'comm_msg_info': {
                        'content': '',
                        'datetime': 1536930840,
                        'fakeid': '2394588245',
                        'id': 1000000262,
                        'status': 2,
                        'type': 49
                    }
                }

        Raises
        ------
        Exception
            If the response has no ``general_msg_list`` entry.
        """
        self.params = {
            "action": "getmsg",
            "__biz": self.__biz,
            "f": "json",
            "offset": str(offset),
            "count": "10",
            "uin": self.uin,
            "key": key,
        }

        origin_url = "https://mp.weixin.qq.com/mp/profile_ext"
        msg_json = self.s.get(
            origin_url, params=self.params, headers=self.headers, proxies=self.proxies
        ).json()

        if "general_msg_list" not in msg_json:
            raise Exception(
                "Failure:\n1.params is error, please check your params\n2.key is lose efficacy, please update your key"
            )

        # SECURITY(review): eval() runs a string received from the network.
        # json.loads would be the safe parser, but it also unescapes the
        # backslash-escaped slashes that callers currently receive, so it
        # is flagged here rather than swapped silently.
        messages = eval(msg_json["general_msg_list"])["list"]
        return [item for item in messages if "app_msg_ext_info" in item]
class Mobile(object):
    """Fetch article links of a WeChat official account through the
    interface used by the mobile WeChat client."""

    def __init__(self, biz, cookie, proxies={"http": None, "https": None}):
        """
        Parameters
        ----------
        biz : str
            Id of the official account to crawl (sent as ``__biz``).
        cookie : str
            Cookie obtained after logging in to WeChat.
        proxies : dict
            Forwarded as the ``proxies`` argument of the requests calls.
            Added because ``get_urls`` reads ``self.proxies``, which was
            never set before and made every call fail with
            ``AttributeError``.
        """
        self.s = requests.session()
        self.__biz = biz
        self.headers = {
            "User-Agent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile",
            "Cookie": cookie,
        }
        # Copy the mapping so instances never share (and accidentally
        # mutate) the default-argument dict.
        self.proxies = dict(proxies) if proxies is not None else None

    def get_urls(self, appmsg_token, offset="0"):
        """
        Return one page of article information.

        Parameters
        ----------
        appmsg_token : str
            Token obtained after logging in with a personal WeChat account.
        offset : str or int
            Start offset, beginning at 0. The original author recommends
            increasing it by 10 per request and de-duplicating afterwards.

        Returns
        -------
        list
            The items of ``general_msg_list`` that contain an
            ``'app_msg_ext_info'`` key. According to the original
            documentation the URLs contain backslash-escaped slashes that
            need to be stripped. Example item::

                {
                    'app_msg_ext_info': {
                        'audio_fileid': 0,
                        'author': '',
                        'content': '',
                        'content_url': article URL (escaped),
                        'copyright_stat': 100,
                        'cover': cover image URL (escaped),
                        'del_flag': 1,
                        'digest': '',
                        'duration': 0,
                        'fileid': 0,
                        'is_multi': 0,
                        'item_show_type': 8,
                        'malicious_content_type': 0,
                        'malicious_title_reason_id': 0,
                        'multi_app_msg_item_list': [],
                        'play_url': '',
                        'source_url': '',
                        'subtype': 9,
                        'title': article title
                    },
                    'comm_msg_info': {
                        'content': '',
                        'datetime': 1536930840,
                        'fakeid': '2394588245',
                        'id': 1000000262,
                        'status': 2,
                        'type': 49
                    }
                }

        Raises
        ------
        Exception
            If the response has no ``general_msg_list`` entry.
        """
        self.params = {
            "action": "getmsg",
            "__biz": self.__biz,
            "f": "json",
            "offset": str(offset),
            "count": "10",
            "appmsg_token": appmsg_token,
        }

        origin_url = "https://mp.weixin.qq.com/mp/profile_ext"
        msg_json = self.s.get(
            origin_url, params=self.params, headers=self.headers, proxies=self.proxies
        ).json()

        if "general_msg_list" not in msg_json:
            raise Exception(
                "Failure:\n1.params is error, please check your params\n2.key is lose efficacy, please update your key"
            )

        # SECURITY(review): eval() runs a string received from the network.
        # json.loads would be the safe parser, but it also unescapes the
        # backslash-escaped slashes that callers currently receive, so it
        # is flagged here rather than swapped silently.
        messages = eval(msg_json["general_msg_list"])["list"]
        return [item for item in messages if "app_msg_ext_info" in item]
class WeBook(object):
    """
    Fetch article links of a WeChat official account through WeRead
    (WeChat Reading).

    Notes from the original author:

    - ``vid`` is fixed per WeChat account, while ``skey`` changes.
    - Mind the crawl rate: pausing about 50 s between requests allowed
      roughly 51 requests.
    - The interface did not appear to ban accounts; when requests are
      judged too frequent, completing the slider captcha on the mobile
      client lets crawling continue.
    """

    def __init__(
        self, skey, vid, user_agent=None, proxies={"http": None, "https": None}
    ):
        """
        Parameters
        ----------
        skey : str
            Sent as the ``skey`` request header.
        vid : str
            Sent as the ``vid`` request header.
        user_agent : str
            ``User-Agent`` header; ``None`` selects a WeRead iOS client
            string.
        proxies : dict
            Forwarded as the ``proxies`` argument of the requests calls.
        """
        self.s = requests.session()
        self.base_url = "https://i.weread.qq.com/book/articles?bookId=MP_WXS_{}&count=20&offset={}&synckey={}"

        if user_agent is None:
            user_agent = "WeRead/5.3.4 (iPhone; iOS 14.1; Scale/2.00)"

        # NOTE(review): the header name is "Cookies" rather than "Cookie";
        # confirm this is what the server expects.
        self.headers = {
            "User-Agent": user_agent,
            "Cookies": "wr_logined=1",
            "skey": skey,
            "vid": vid,
        }
        # Copy the mapping so instances never share (and accidentally
        # mutate) the default-argument dict.
        self.proxies = dict(proxies) if proxies is not None else None

    def get_urls(self, bookid, offset="0"):
        """
        Return one page of article reviews for a book id.

        Parameters
        ----------
        bookid : str or int
            Inserted after ``MP_WXS_`` in the ``bookId`` query parameter.
        offset : str or int
            Start offset of the page.

        Returns
        -------
        list
            The ``"review"`` entry of every item in the response's
            ``"reviews"`` list. When the response has no ``"reviews"``
            entry, the response is printed and ``[]`` is returned.
        """
        # synckey is the current Unix time in whole seconds.
        synckey = int(time.time())
        url = self.base_url.format(bookid, offset, synckey)
        res = self.s.get(url, headers=self.headers, proxies=self.proxies)

        # Parse the body once instead of on every access.
        payload = res.json()
        if "reviews" in payload:
            return [item["review"] for item in payload["reviews"]]

        print(payload)
        return []