wechatarticles.ArticlesInfo 源代码

# coding:  utf-8
import re

import requests
from bs4 import BeautifulSoup as bs


class ArticlesInfo(object):
    """Fetch detailed data for WeChat official-account articles.

    Uses credentials captured from a logged-in WeChat client to query the
    read count, like count, comments and text of an article.
    """

    # Query-string parameters that identify an article, in the order in
    # which ``__get_params`` returns them.
    _PARAM_NAMES = ("__biz", "mid", "idx", "sn")

    # Message raised whenever an article url cannot be used.
    _URL_ERROR = "params is error, please check your article_url"

    def __init__(self, appmsg_token, cookie, proxies=None):
        """
        Initialise the client.

        Parameters
        ----------
        appmsg_token: str
            appmsg_token captured with a packet sniffer while opening an
            official-account article
        cookie: str
            cookie captured with a packet sniffer while opening an
            official-account article
        proxies: dict, optional
            proxies mapping handed to ``requests``. ``None`` (the default)
            means ``{"http": None, "https": None}``.
        """
        self.s = requests.session()
        # Ignore proxy/auth settings coming from the environment.
        self.s.trust_env = False
        self.appmsg_token = appmsg_token
        self.headers = {
            "User-Agent": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0Chrome/57.0.2987.132 MQQBrowser/6.2 Mobile",
            "Cookie": cookie,
        }
        self.data = {
            "is_only_read": "1",
            "is_temp_url": "0",
            "appmsg_type": "9",  # new parameter; like_num is not returned without it
        }
        # Build the default per instance so instances never share (and
        # accidentally mutate) one dict object.
        if proxies is None:
            proxies = {"http": None, "https": None}
        self.proxies = proxies

    def __verify_url(self, article_url):
        """
        Run a simple sanity check on an article url.

        Parameters
        ----------
        article_url: str
            article link

        Raises
        ------
        Exception
            if any of the required substrings is missing from the url
        """
        verify_lst = ["mp.weixin.qq.com", "__biz", "mid", "sn", "idx"]
        for string in verify_lst:
            if string not in article_url:
                raise Exception(self._URL_ERROR)

    def read_like_nums(self, article_url):
        """
        Get the read count and like counts of an article.

        Parameters
        ----------
        article_url: str
            article link

        Returns
        -------
        (int, int, int):
            the ``read_num``, ``like_num`` and ``old_like_num`` fields of
            the ``appmsgstat`` object returned by the server

        Raises
        ------
        Exception
            if the statistics cannot be fetched; the underlying error is
            attached as ``__cause__``
        """
        try:
            appmsgstat = self.__get_appmsgext(article_url)["appmsgstat"]
            return (
                appmsgstat["read_num"],
                appmsgstat["like_num"],
                appmsgstat["old_like_num"],
            )
        except Exception as exc:
            # Keep the historical message/type, but chain the real cause so
            # that e.g. a cookie/token failure is not hidden from the caller.
            raise Exception(self._URL_ERROR) from exc

    def comments(self, article_url):
        """
        Get the comments of an article.

        Parameters
        ----------
        article_url: str
            article link

        Returns
        -------
        json::

            {
                "base_resp": {
                    "errmsg": "ok",
                    "ret": 0
                },
                "elected_comment": [
                    {
                        "content": text of the user's comment,
                        "content_id": "6846263421277569047",
                        "create_time": 1520098511,
                        "id": 3,
                        "is_from_friend": 0,
                        "is_from_me": 0,
                        "is_top": 0,  whether the comment is pinned
                        "like_id": 10001,
                        "like_num": 3,
                        "like_status": 0,
                        "logo_url": "http://wx.qlogo.cn/mmhead/OibRNdtlJdkFLMHYLMR92Lvq0PicDpJpbnaicP3Z6kVcCicLPVjCWbAA9w/132",
                        "my_id": 23,
                        "nick_name": name of the commenting user,
                        "reply": {
                            "reply_list": [ ]
                        }
                    }
                ],
                "elected_comment_total_cnt": 3,  total number of comments
                "enabled": 1,
                "friend_comment": [ ],
                "is_fans": 1,
                "logo_url": "http://wx.qlogo.cn/mmhead/Q3auHgzwzM6GAic0FAHOu9Gtv5lEu5kUqO6y6EjEFjAhuhUNIS7Y2AQ/132",
                "my_comment": [ ],
                "nick_name": name of the current user,
                "only_fans_can_comment": false
            }

        An empty dict is returned when the request or its decoding fails
        (the error is printed). An invalid url still raises.
        """
        biz, _, idx, _ = self.__get_params(article_url)
        getcomment_url = "https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&__biz={}&idx={}&comment_id={}&limit=100"
        try:
            url = getcomment_url.format(biz, idx, self.__get_comment_id(article_url))
            comment_json = self.s.get(
                url, headers=self.headers, proxies=self.proxies
            ).json()
        except Exception as e:
            # Best effort: report the failure and hand back an empty result.
            print(e)
            comment_json = {}
        return comment_json

    def __get_comment_id(self, article_url):
        """
        Get the comment_id of an article.

        Parameters
        ----------
        article_url: str
            article link

        Returns
        -------
        str:
            comment_id, a parameter required to fetch the comments

        Raises
        ------
        ValueError
            if the article page does not contain a comment_id
        """
        # NOTE(review): unlike the other requests this one sends no headers
        # (no cookie / user agent) - confirm that this is intentional.
        res = self.s.get(article_url, data=self.data, proxies=self.proxies)
        # Extract the digits of `comment_id = "<digits>"` from the page.
        match = re.search(r'comment_id = "(\d+)"', res.text)
        if match is None:
            raise ValueError("comment_id not found in article page")
        return match.group(1)

    def __get_params(self, article_url):
        """
        Parse an article url and extract the request parameters.

        Parameters are looked up by name, so their order in the url does
        not matter. Values are returned raw (not percent-decoded). When one
        of the expected names is missing, the first four query values are
        used instead, which is how older versions read every url.

        Parameters
        ----------
        article_url: str
            article link

        Returns
        -------
        (str, str, str, str):
            __biz, mid, idx, sn

        Raises
        ------
        Exception
            if the url fails the sanity check or carries too few parameters
        """
        # Simple sanity check of the url first.
        self.__verify_url(article_url)

        pieces = article_url.split("?")
        if len(pieces) < 2:
            raise Exception(self._URL_ERROR)
        # Drop the fragment ("#rd", "#wechat_redirect", ...) and undo the
        # HTML escaping of "&" found in links copied from page source.
        query = pieces[1].split("#", 1)[0].replace("&amp;", "&")

        pairs = [item.partition("=") for item in query.split("&") if item]
        named = {}
        for key, sep, value in pairs:
            if sep:
                # The first occurrence of a repeated key wins.
                named.setdefault(key, value)

        if all(name in named for name in self._PARAM_NAMES):
            return tuple(named[name] for name in self._PARAM_NAMES)

        # Legacy fallback: take the first four values positionally.
        values = [value for _, sep, value in pairs if sep]
        if len(values) < len(self._PARAM_NAMES):
            raise Exception(self._URL_ERROR)
        return tuple(values[: len(self._PARAM_NAMES)])

    def __get_appmsgext(self, article_url):
        """
        Get the detailed information of a single article.

        Parameters
        ----------
        article_url: str
            article link

        Returns
        -------
        json:
            detailed information about the article::

                {
                    'advertisement_info': [],
                    'advertisement_num': 0,
                    'appmsgstat': {'is_login': True,
                                   'like_num': 12,
                                   'liked': False,
                                   'read_num': 288,
                                   'real_read_num': 0,
                                   'ret': 0,
                                   'show': True},
                    'base_resp': {'wxtoken': 2045685972},
                    'reward_head_imgs': []
                }

        Raises
        ------
        Exception
            if the response carries no ``appmsgstat`` entry
        """
        biz, mid, idx, sn = self.__get_params(article_url)

        # The article parameters travel in the POST body rather than in the
        # query string (the query-string form appears to work as well).
        origin_url = "https://mp.weixin.qq.com/mp/getappmsgext?"
        appmsgext_url = origin_url + "appmsg_token={}&x5=0".format(self.appmsg_token)
        # NOTE(review): this updates the shared ``self.data`` in place, so
        # the values stay there for later requests that send ``self.data``.
        self.data.update({"__biz": biz, "mid": mid, "sn": sn, "idx": idx})

        # NOTE(review): this goes through the module-level ``requests.post``
        # rather than the session ``self.s`` used everywhere else.
        appmsgext_json = requests.post(
            appmsgext_url, headers=self.headers, data=self.data, proxies=self.proxies
        ).json()

        if "appmsgstat" not in appmsgext_json:
            raise Exception("get info error, please check your cookie and appmsg_token")
        return appmsgext_json

    def content(self, url):
        """
        Download an article page and extract its text.

        Parameters
        ----------
        url: str
            article link; surrounding whitespace is stripped

        Returns
        -------
        (str, int, int):
            the text, its length and the number of ``img`` tags. When the
            page has no ``rich_media_content`` element, the text of the
            ``js_panel_like_title`` element is used and the image count is
            0. ``("", 0, 0)`` is returned when extraction fails.

        Raises
        ------
        SystemError
            if the page contains the "visiting too frequently" notice
        """
        html_text = self.s.get(
            url.strip(), headers=self.headers, proxies=self.proxies
        ).text
        soup = bs(html_text, "lxml")

        ctext = "你的访问过于频繁,需要从微信打开验证身份,是否需要继续访问当前页面"
        if ctext in html_text:
            raise SystemError("访问频繁!")

        try:
            body = soup.find(class_="rich_media_area_primary_inner")
            content_p = body.find(class_="rich_media_content")
            if content_p:
                text = content_p.text.strip()
                return text, len(text), len(body.find_all("img"))
            title = soup.find(id="js_panel_like_title").text.strip()
            return title, len(title), 0
        except Exception:
            # Unexpected page layout (a lookup returned None, ...): report
            # "no content" without swallowing KeyboardInterrupt/SystemExit.
            return "", 0, 0