diff --git a/专辑评论.py b/专辑评论.py
new file mode 100644
index 0000000..ad71ec6
--- /dev/null
+++ b/专辑评论.py
@@ -0,0 +1,74 @@
+import time
+
+import pandas as pd
+import requests
+
+headers = {
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
+}
+# url = 'https://www.ximalaya.com/revision/rank/v4/element?rankingId=100364'
+url = 'https://www.ximalaya.com/revision/rank/v4/element?rankingId=100191'
+
+
+# Get the list of album IDs on the ranking list
+def get_albums_url(url):
+    albums = requests.get(url, headers=headers)
+    albums_dict = albums.json()
+    albums_list = albums_dict.get("data").get("rankList")[0].get("ids")
+
+    return albums_list
+
+
+# Build a comment-list API URL for every album ID
+def get_url(albums_list):
+    url_list = []
+    t = round(time.time() * 1000)
+    for album_id in albums_list:
+        albums_url = "https://mobile.ximalaya.com/album-comment-mobile/web/album/comment/list/query/{}?albumId={}&order=content-score-desc".format(t, album_id)
+        url_list.append(albums_url)
+
+    return url_list
+
+
+# Collect the comments of every album, paging through each comment list
+def get_albums_comments(url_list, pagesize):
+    comments_list = []
+    for url in url_list:
+        # The first request is only used to read the total comment count
+        data_url = url + "&pageId=1&pageSize=10"
+        print('Now processing {}'.format(url))
+        data = requests.get(data_url, headers=headers).json()
+        comments_count = data.get("data").get("comments").get("totalCount")
+
+        # Pages needed = ceiling of totalCount / pagesize, never more than 60 pages
+        pages = min(-(-comments_count // pagesize), 60)
+
+        for i in range(pages):  # walk the pages and pull out the comment text
+            print('Now processing page {}'.format(i + 1))
+            comment_url = url + "&page={}&pageSize={}".format(i + 1, pagesize)
+            comment_data = requests.get(comment_url, headers=headers).json()
+            comment_list = comment_data.get("data").get("comments").get("list")
+            for comment in comment_list:
+                comments_list.append(comment.get("content"))
+
+    return comments_list
+
+
+# Save a cover image to disk (defined for later use, not called below)
+def save_pic(img_src, img_name):
+    # path = r"C:\Users\Administrator\Pictures\xima" + "\\" + img_name + ".jpg"
+    path = r"C:\Users\Cori\Downloads\喜马图片" + "\\" + img_name + ".jpg"
+    try:
+        r = requests.get(img_src, headers=headers)  # fetch the image with the spoofed User-Agent
+        with open(path, 'wb') as f:  # open the target file in binary mode
+            f.write(r.content)  # write the raw image bytes
+    except Exception as e:
+        print(e)
+
+
+album_ids = get_albums_url(url)
+comment_urls = get_url(album_ids)
+comments = get_albums_comments(comment_urls, 20)
+data = pd.DataFrame(comments)
+data.to_excel('专辑评论.xlsx')
+print(comments)