热门推荐
wangyiyun_网络流行_网易云爬虫数据分析_
2024-11-01 14:22
import glob

wangyiyun_网络流行_网易云爬虫数据分析_

# NOTE(review): this block was recovered from a scraped blog post in which the
# whole script had been collapsed onto a few unformatted lines.  The original
# code was broken in several ways that are repaired below:
#   * the `requests.get(...)` / `etree.HTML(...)` calls were lost, leaving
#     `html` and `html1` undefined -- they are reconstructed here;
#   * the request-header key/value pairs were floating bare expressions --
#     they are wrapped into proper `headers` dicts;
#   * several regexes had backslashes stripped by the scraper
#     ((d+?) -> (\d+?), [.*] -> \[.*\], [(.*?)] -> \[(.*?)\]) and two href
#     patterns had been rewritten into CSDN download links -- plausible
#     originals are restored (TODO: confirm against the live page markup);
#   * the `os.remove` calls inside the cleanup glob loops were lost.
# The hard-coded Windows paths and the long-expired session cookies are kept
# byte-for-byte from the source article.

import requests
import jieba.analyse
import os
import re
import lxml
import json
import heapq
from lxml import etree
from pyecharts.charts import Bar
from pyecharts import options as opts
from snownlp import SnowNLP
# import matplotlib.pyplot as plt
# from pyecharts import Bar


class WanYiYun:
    """Crawl an artist's albums, songs and lyrics from NetEase Cloud Music."""

    def GetAlbum(self):
        """Fetch the artist's album-list page, extract every album name and
        album id, write them to disk, and trigger a per-album song crawl via
        self.GetLyric1().

        Side effects: deletes and rewrites 专辑信息.txt / 专辑歌曲信息.txt;
        performs one HTTP GET.  Returns None.
        """
        # Jay Chou's album list on NetEase Cloud Music (artist id 6452).
        urls = "http://music.163.com/artist/album?id=6452&limit=100&offset=0"
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'cookie': '_iuqxldmzr_=32; _ntes_nnid=dc7dbed33626ab3af002944fabe23bc4,1524151830800; _ntes_nuid=dc7dbed33626ab3af002944fabe23bc4; __utmc=94650624; __utmz=94650624.1524151831.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=94650624.1505452853.1524151831.1524151831.1524176140.2; WM_TID=RpKJQQ90pzUSYfuSWgFDY6QEK1Gb4Ulg; JSESSIONID-WYYY=ZBmSOShrk4UKH5K%5CVasEPuc0b%2Fq6m5eAE91jWCmD6UpdB2y4vbeazO%2FpQK%5CgiBW0MUDDWfB1EuNaV5c4wIJZ08hYQKDhpsHnDeMAgoz98dt%2B%2BFfhdiiNJw9Y9vRR5S4GU%2FziFp%2BliFX1QTJj%2BbaIGD3YxVzgumklAwJ0uBe%2FcGT6VeQW%3A1524179765762; __utmb=94650624.24.10.1524176140',
            'Referer': 'https://music.163.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        }
        # NOTE(review): reconstructed -- the fetch/parse lines were missing
        # from the scraped source.
        html = requests.get(urls, headers=headers)
        html1 = etree.HTML(html.text)
        # Album covers live under this div; kept for parity with the original
        # even though the extraction below works on the raw HTML text.
        html_data = html1.xpath('//div[@class="u-cover u-cover-alb3"]')[0]
        pattern = re.compile(r'<div class="u-cover u-cover-alb3" title=(.*?)>')
        items = re.findall(pattern, html.text)
        cal = 0
        # Delete the output files first, otherwise every run appends.
        if os.path.exists("E:/大三/下学期/python/大作业/专辑信息.txt"):
            os.remove("E:/大三/下学期/python/大作业/专辑信息.txt")
        if os.path.exists("E:/大三/下学期/python/大作业/专辑歌曲信息.txt"):
            os.remove("E:/大三/下学期/python/大作业/专辑歌曲信息.txt")
        # Walk every album title found on the page.
        for i in items:
            cal += 1
            # The captured title still carries its double quotes; strip them
            # before interpolating it into the id-lookup regex.
            p = i.replace('"', '')
            # NOTE(review): href reconstructed -- the scrape had replaced it
            # with a CSDN download link.  TODO confirm exact markup.
            pattern1 = re.compile(
                r'<a href="/album\?id=(.*?)" class="tit s-fc0">%s</a>' % (p))
            id1 = re.findall(pattern1, html.text)
            # print("专辑的名字是:%s!!专辑的ID是%s:"%(i,items1))
            with open("E:/大三/下学期/python/大作业/专辑信息.txt", 'a') as f:
                f.write("专辑的名字是:%s!!专辑的ID是%s :" % (i, id1))
                self.GetLyric1(i, id1)
        # print("总数是%d"%(cal))
        print("获取专辑以及专辑ID成功!!!!!")

    def GetLyric1(self, album, id1):
        """Fetch one album page and append every song name / song id found on
        it to 专辑歌曲信息.txt.

        album -- album title string (unused beyond being the caller's context).
        id1   -- re.findall() result holding the album id, e.g. ["12345"].
        """
        urls1 = "http://music.163.com/#/album?id="
        # id1 is a findall() list, so str() yields "['12345']"; strip the list
        # punctuation, and drop the "#/" SPA fragment so the URL is fetchable.
        urls2 = str(id1)
        urls3 = urls1 + urls2
        urls = urls3.replace("[", "").replace("]", "").replace("'", "").replace("#/", "")
        headers = {
            'cookie': '_iuqxldmzr_=32; _ntes_nnid=dc7dbed33626ab3af002944fabe23bc4,1524151830800; _ntes_nuid=dc7dbed33626ab3af002944fabe23bc4; __utmz=94650624.1524151831.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utma=94650624.1505452853.1524151831.1524176140.1524296365.3; __utmc=94650624; WM_TID=RpKJQQ90pzUSYfuSWgFDY6QEK1Gb4Ulg; JSESSIONID-WYYY=7t6F3r9Uzy8uEXHPnVnWTXRP%5CSXg9U3%5CN8V5AROB6BIe%2B4ie5ch%2FPY8fc0WV%2BIA2ya%5CyY5HUBc6Pzh0D5cgpb6fUbRKMzMA%2BmIzzBcxPcEJE5voa%2FHA8H7TWUzvaIt%2FZnA%5CjVghKzoQXNM0bcm%2FBHkGwaOHAadGDnthIqngoYQsNKQQj%3A1524299905306; __utmb=94650624.21.10.1524296365',
            'Referer': 'http://music.163.com/',
        }
        # NOTE(review): reconstructed -- the fetch/parse lines were missing.
        html = requests.get(urls, headers=headers)
        html1 = etree.HTML(html.text)
        # The hidden <ul class="f-hide"> holds the album's track anchors.
        html_data = html1.xpath('//ul[@class="f-hide"]//a')
        for i in html_data:
            # string(.) flattens the anchor's full text content (song title).
            html_data1 = i.xpath('string(.)')
            html_data2 = str(html_data1)
            # NOTE(review): (\d+?) restored -- the scraper had stripped the
            # backslash, and the href had been mangled into a CSDN link.
            pattern1 = re.compile(
                r'<li><a href="/song\?id=(\d+?)">%s</a></li>' % (html_data2))
            items = re.findall(pattern1, html.text)
            # print("歌曲的名称为: %s"%(html_data2))
            # print("歌曲的id为: %s"%(items))
            with open("E:/大三/下学期/python/大作业/专辑歌曲信息.txt", 'a') as f:
                print(len(items))
                if len(items) > 0:
                    f.write("歌曲的名字是: %s!!歌曲的ID是%s " % (html_data2, items))
                    print("获取歌曲 %s 以及歌曲的ID %s写入文件成功" % (html_data2, items))
        # http://music.163.com/#/song?id=185617

    def GetLyric2(self):
        """Read 专辑歌曲信息.txt, call the lyric API for every recorded song
        id, strip the "[mm:ss.xx]" timestamps, and write each lyric to its own
        歌曲名-<title>.txt file.
        """
        # Remove output from previous runs so nothing is duplicated.
        # NOTE(review): the os.remove calls inside these loops were lost in
        # the scrape and are reconstructed here.
        for i in glob.glob("E:/大三/下学期/python/大作业/*热评*"):
            os.remove(i)
        for i in glob.glob("E:/大三/下学期/python/大作业/*歌曲名*"):
            os.remove(i)
        # `with` added so the input file is closed (the original leaked it).
        with open("E:/大三/下学期/python/大作业/专辑歌曲信息.txt", ) as file_object:
            list_of_line = file_object.readlines()
        aaa = 1
        namelist = ""
        for i in list_of_line:
            # Each line looks like:
            #   歌曲的名字是: 同一种调调!!歌曲的ID是['186020']
            pattern1 = re.compile(r'歌曲的名字是: (.*?)!!歌曲的ID是')
            # NOTE(review): brackets escaped -- the scraped "[(.*?)]" was an
            # accidental character class and never matched the id.
            pattern2 = re.compile(r'歌曲的ID是\[(.*?)\]')
            items1 = str(re.findall(pattern1, i)).replace("[", "").replace("]", "").replace("'", "")
            items2 = str(re.findall(pattern2, i)).replace("[", "").replace("]", "").replace('"', "").replace("'", "")
            # Lyric endpoint (observed via devtools in the source article), e.g.
            # http://music.163.com/api/song/lyric?id=186017&lv=1&kv=1&tv=-1
            urls = "http://music.163.com/api/song/lyric?" + "id=" + str(items2) + '&lv=1&kv=1&tv=-1'
            # urls = "http://music.163.com/api/song/lyric?id=186018&lv=1&kv=1&tv=-1"
            # print(urls)
            # NOTE(review): reconstructed -- the request line was missing.
            html = requests.get(urls)
            json_obj = html.text
            j = json.loads(json_obj)
            lrc = j['lrc']['lyric']
            # Strip "[mm:ss.xx]" timestamps from the LRC body.
            # NOTE(review): restored from the scrape-mangled pattern r'[.*]'.
            pat = re.compile(r'\[.*\]')
            lrc = re.sub(pat, "", lrc)
            lrc = lrc.strip()
            print(lrc)
            lrc = str(lrc)
            with open("E:/大三/下学期/python/大作业/歌曲名-" + items1 + ".txt", 'w', encoding='utf-8') as f:
                f.write(lrc)
            aaa += 1
     行业      资讯      企业新闻      行情      企业黄页      同类资讯      网站地图      返回首页 迅博思语资讯移动站 http://mip.ksxb.net/ , 查看更多