import time

import pandas as pd
import requests
from lxml import html

etree = html.etree  # not used anywhere below

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.203',
}

# url = 'https://www.ximalaya.com/revision/rank/v4/element?rankingId=100364'
url = 'https://www.ximalaya.com/revision/rank/v4/element?rankingId=100191'

# Get the list of album IDs from the ranking list.
def get_albums_url(url):
    albums = requests.request("GET", url, headers=headers)
    albums_dict = albums.json()
    # The album IDs sit under data -> rankList -> [0] -> ids in the response.
    albums_list = albums_dict.get("data").get("rankList")[0].get("ids")
    return albums_list

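# Judging from the chain of .get() calls above, the ranking endpoint is expected
# to return JSON shaped roughly like this (values illustrative, not taken from a
# real response):
#   {"data": {"rankList": [{"ids": [12345, 67890, ...]}]}}
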
# Build one comment-API URL per album ID.
def get_url(albums_list):
    url_list = []
    t = round(time.time() * 1000)  # current time in milliseconds, used as a path segment
    for i in albums_list:
        albums_url = "https://mobile.ximalaya.com/album-comment-mobile/web/album/comment/list/query/{}?albumId={}&order=content-score-desc".format(t, i)
        url_list.append(albums_url)
    return url_list

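# For example, album ID 123 and timestamp 1700000000000 would produce (straight
# from the format string above; the two values themselves are made up):
#   https://mobile.ximalaya.com/album-comment-mobile/web/album/comment/list/query/1700000000000?albumId=123&order=content-score-desc
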
# Walk every comment-API URL and collect the text of each comment.
# The first request only reads totalCount; the comments themselves are then
# fetched page by page, with at most 60 pages per album.
def get_albums_comments(url_list, pagesize):
    comments_list = []
    for url in url_list:
        data_url = url + "&pageId=1&pageSize=10"  # only used to read the total comment count
        print('Now processing {}'.format(url))
        data = requests.request("GET", data_url, headers=headers)
        data = data.json()
        comments_count = data.get("data").get("comments").get("totalCount")

        pages = comments_count // pagesize        # number of full pages
        end_pagesize = comments_count % pagesize  # size of the final partial page, if any

        def fetch_page(page, size):
            # Fetch one page of comments and append every comment's content.
            print('Now processing page {}'.format(page))
            comment_url = url + "&page={}&pageSize={}".format(page, size)
            comment_data = requests.request("GET", comment_url, headers=headers)
            comment_data = comment_data.json()
            comment_list = comment_data.get("data").get("comments").get("list")
            for comment in comment_list:
                comments_list.append(comment.get("content"))

        if pages <= 60:
            for i in range(pages):       # all full pages
                fetch_page(i + 1, pagesize)
            if end_pagesize != 0:        # trailing partial page when the count does not divide evenly
                fetch_page(pages + 1, end_pagesize)
        else:
            for i in range(60):          # cap at 60 pages per album
                fetch_page(i + 1, pagesize)

    return comments_list

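# The pages above are requested back to back. If the server starts throttling,
# one option is a short pause between page requests (a sketch, not something
# this script currently does):
#   time.sleep(0.5)
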
# Download a single image to disk; any error is printed and swallowed.
def save_pic(img_src, img_name):
    # path = r"C:\Users\Administrator\Pictures\xima" + "\\" + img_name + ".jpg"
    path = r"C:\Users\Cori\Downloads\喜马图片" + "\\" + img_name + ".jpg"
    try:
        r = requests.request("GET", img_src, headers=headers)  # fetch the image with the spoofed User-Agent
        with open(path, 'wb') as f:  # write the response body out as a binary (image) file
            f.write(r.content)
    except Exception as e:
        print(e)

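# save_pic is defined above but never called in this script. A minimal usage
# sketch (the URL and file name below are placeholders, not real Ximalaya assets):
#   save_pic("https://example.com/cover.jpg", "cover_001")
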
# Scrape the ranking list, build the per-album comment URLs, collect the
# comments, and dump them to an Excel file (pandas needs openpyxl for .xlsx).
a = get_albums_url(url)
b = get_url(a)
c = get_albums_comments(b, 20)
data = pd.DataFrame(c)
data.to_excel('专辑评论.xlsx')
print(c)