原创

python爬虫实战项目


python爬虫实战项目

1. LOL所有英雄皮肤下载

from fake_useragent import UserAgent
import requests, json, os

# 爬取网页所有英雄的皮肤图片
# https://lol.qq.com/data/info-heros.shtml


# 获取英雄id
def get_heroList():
    url = 'https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js'
    headers = {
        'User-Agent': UserAgent().chrome
    }
    try:
        response = requests.get(url, headers=headers)
        # print(response.text)
        # print(type(response.text))
        response = json.loads(response.text)
        # print(type(response))
        hero_ids = []
        for i in response['hero']:
            hero_ids.append(i['heroId'])
        # print(hero_ids)
        return hero_ids
    except:
        print('获取英雄id失败')
        return None


# 根据英雄id获取英雄皮肤名称和图片下载地址
def get_skinNames(id):
    url = 'https://game.gtimg.cn/images/lol/act/img/js/hero/{}.js'.format(id)
    headers = {
        'User-Agent': UserAgent().chrome
    }
    try:
        response = requests.get(url, headers=headers)
        response = json.loads(response.text)
        skinnames = []
        skin_urls = []
        for i in response['skins'][:-1]:
            if i['mainImg'] != '':
                skinnames.append(i['name'])
                skin_urls.append(i['mainImg'])
        # print(skinnames)
        return skinnames, skin_urls
    except:
        print('获取英雄皮肤名称失败')
        return None


# 根据名称,下载图片保存文件夹
def downloadImg(skinnames, skin_urls):
    headers = {
        'User-Agent': UserAgent().chrome
    }
    filename = skinnames[0]
    os.makedirs(filename, exist_ok=True)
    for skinname, skin_url in zip(skinnames, skin_urls):
        try:
            response = requests.get(skin_url, headers=headers)
        except:
            print(skinname + ' 下载失败')
            return
        with open(filename+'/'+skinname.replace('/', '_') + '.jpg', 'wb') as f:
            f.write(response.content)
    # print(filename + ' 下载完成')


if __name__ == '__main__':
    hero_ids = get_heroList()
    # get_heroList() returns None on failure; guard before iterating so we
    # don't crash with "NoneType is not iterable".
    if hero_ids:
        for i, id in enumerate(hero_ids, start=1):
            result = get_skinNames(id)
            if result is None:
                # Error already reported for this hero; move on to the next.
                continue
            skinnames, skin_urls = result
            downloadImg(skinnames, skin_urls)
            # '\r' rewrites the same console line as a progress indicator.
            print('\r下载进度:' + str(i) + '/' + str(len(hero_ids)), end='')

2. 音乐下载软件

import requests, json, re
from tkinter import Tk, Button, Entry, StringVar, Radiobutton, Frame
from tkinter import messagebox


# 说明:
# 爬取网站:https://music.zhuolin.wang/
# ajax异步请求
# 下载的歌曲在软件所在目录下


# 根据输入找到歌曲信息
def get_musicInfo(query, sourse):
    music_ids = []
    music_names = []
    music_singers = []
    url = 'https://music.zhuolin.wang/api.php?'
    data = {
        'types': 'search',
        'count': '5',
        'source': sourse,
        'pages': '1',
        'name': query
    }
    headers = {
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '37',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': '',
        'Host': 'music.zhuolin.wang',
        'Origin': 'https://music.zhuolin.wang',
        'Referer': 'https://music.zhuolin.wang/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    response = requests.post(url, headers=headers, data=data)
    response = json.loads(response.text)
    for i in response:
        music_ids.append(i['id'])
        music_names.append(i['name'])
        music_singers.append(i['artist'])

    print(music_ids)
    print(music_names)
    print(music_singers)
    # return music_ids, music_names, music_singers


# 根据id获取歌曲下载链接
def get_downloadUrl(music_id, name, singer, sourse):
    url = 'https://music.zhuolin.wang/api.php?'
    data = {
        'types': 'url',
        'id': music_id,
        'source': sourse
    }
    headers = {
        'Accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '37',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': '',
        'Host': 'music.zhuolin.wang',
        'Origin': 'https://music.zhuolin.wang',
        'Referer': 'https://music.zhuolin.wang/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    response = requests.post(url, data=data, headers=headers)
    print(response.text)
    downloadurl = re.search(r'http:(.+)",', response.text)
    if downloadurl != None:
        downloadurl = downloadurl.group().replace('\\', '')
        downloadMusic(downloadurl, name, singer)
    else:
        messagebox.showinfo('抱歉', '该歌曲暂不提供下载,请您更换其他平台下载')


# 下载歌曲到本地
def downloadMusic(url, name, singer):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        with open(name + '-' + singer + '.mp3', 'wb')as f:
            f.write(response.content)
        messagebox.showinfo('恭喜', name + '-' + singer + ' 下载完成')
    except:
        messagebox.showinfo('抱歉', name + ' 下载失败')


# 点击搜索执行
def search_music():
    query = entry.get()
    sourse = v.get()
    if query == '':
        messagebox.showinfo('提示', '请输入内容!')
        return False
    music_ids, music_names, music_singers = get_musicInfo(query, sourse)
    # 重新进行组件内容和按钮功能的设置
    for i in range(5):
        if i == 0:
            id1 = str(music_ids[i])
            name1 = str(music_names[i])
            singer1 = str(music_singers[i][0])
            value1.set(name1 + '   ' + singer1)
            entry1['textvariable'] = value1
            button1['command'] = lambda: download(id1, name1, singer1)
        if i == 1:
            id2 = str(music_ids[i])
            name2 = str(music_names[i])
            singer2 = str(music_singers[i][0])
            value2.set(name2 + '   ' + singer2)
            entry2['textvariable'] = value2
            button2['command'] = lambda: download(id2, name2, singer2)
        if i == 2:
            id3 = str(music_ids[i])
            name3 = str(music_names[i])
            singer3 = str(music_singers[i][0])
            value3.set(name3 + '   ' + singer3)
            entry3['textvariable'] = value3
            button3['command'] = lambda: download(id3, name3, singer3)
        if i == 3:
            id4 = str(music_ids[i])
            name4 = str(music_names[i])
            singer4 = str(music_singers[i][0])
            value4.set(name4 + '   ' + singer4)
            entry4['textvariable'] = value4
            button4['command'] = lambda: download(id4, name4, singer4)
        if i == 4:
            id5 = str(music_ids[i])
            name5 = str(music_names[i])
            singer5 = str(music_singers[i][0])
            value5.set(name5 + '   ' + singer5)
            entry5['textvariable'] = value5
            button5['command'] = lambda: download(id5, name5, singer5)


# 没有搜索之前点击下载按钮的提示
def tishi():
    messagebox.showinfo('提示', '请先进行搜索')


# 点击下载按钮执行(有点多余,可以去掉直接用get_downloadUrl)
def download(id, name, singer):
    sourse = v.get()
    get_downloadUrl(id, name, singer, sourse)


if __name__ == '__main__':
    # get_musicInfo('嘲笑声','tencent')
    # get_downloadUrl('0030tRLQ1e4mCn','嘲笑声','Big Daddy','tencent')

    # Fixed-size 500x400 window centered on the screen.
    root = Tk()
    win_width = root.winfo_screenwidth()
    win_height = root.winfo_screenheight()
    root.geometry('500x400+' + str(int(win_width / 2 - 250)) + '+' + str(int(win_height / 2 - 200)))
    root.minsize(500, 400)
    root.maxsize(500, 400)
    root.title('音乐下载器-敲出一片天')
    # get_downloadUrl('64561','单车(Live)','陈奕迅')

    # Search box with a placeholder hint.
    query = StringVar()
    query.set('歌名+歌手更准确哦')

    # entry的参数:https://www.cnblogs.com/monsteryang/p/6575877.html

    entry = Entry(root, width=21, font=('隶书', 20), foreground='orange',
                  borderwidth=3, insertbackground='red', textvariable=query)
    entry.place(relx=0.05, rely=0.1)

    button = Button(root, width=8, text='搜索', font=('隶书', 18), bg='orange', fg='white', command=search_music)
    button.place(relx=0.7, rely=0.09)

    # Music-source radio buttons; `v` holds the selected backend id.
    v = StringVar()
    v.set('netease')
    r1 = Radiobutton(text='网易', value='netease', font=('隶书', 18), fg='orange', variable=v)
    r2 = Radiobutton(text='qq', value='tencent', font=('隶书', 18), fg='orange', variable=v)
    r3 = Radiobutton(text='酷狗', value='kugou', font=('隶书', 18), fg='orange', variable=v)
    r4 = Radiobutton(text='百度', value='baidu', font=('隶书', 18), fg='orange', variable=v)
    r1.place(relx=0.08, rely=0.2)
    r2.place(relx=0.28, rely=0.2)
    r3.place(relx=0.48, rely=0.2)
    r4.place(relx=0.68, rely=0.2)

    frame = Frame(root, height=250, width=420, bd=1, relief="groove", bg='gray')
    frame.place(relx=0.06, rely=0.3)

    # Five result rows; search_music() fills valueN and rebinds buttonN.
    # Bug fix: each row entry is bound to its own StringVar (value1..value5)
    # instead of the shared `query` variable, so the rows no longer all
    # mirror the search box text before the first search.
    value1 = StringVar()
    entry1 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=value1)
    entry1.place(relx=0.05, rely=0.04)
    button1 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button1.place(relx=0.7, rely=0.04)

    value2 = StringVar()
    entry2 = Entry(frame, width=21, font=('隶书', 15), relief="flat", bg='gray',
                   borderwidth=3, textvariable=value2)
    entry2.place(relx=0.05, rely=0.24)
    button2 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button2.place(relx=0.7, rely=0.24)

    value3 = StringVar()
    entry3 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=value3)
    entry3.place(relx=0.05, rely=0.44)
    button3 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button3.place(relx=0.7, rely=0.44)

    value4 = StringVar()
    entry4 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=value4)
    entry4.place(relx=0.05, rely=0.64)
    button4 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button4.place(relx=0.7, rely=0.64)

    value5 = StringVar()
    entry5 = Entry(frame, width=21, font=('隶书', 15), bg='gray', relief="flat",
                   borderwidth=3, textvariable=value5)
    entry5.place(relx=0.05, rely=0.84)
    button5 = Button(frame, width=8, text='下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button5.place(relx=0.7, rely=0.84)

    root.mainloop()

3. b站视频下载

import requests
import re
import json
from tkinter import *
from tkinter import messagebox


# 获得播放页面代码,获取我们需要的数据,转为json数据
def get_html_one(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    response = requests.get(url, headers=headers)
    try:
        title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', response.text)
        response = re.search(r'"data":.+,"session"', response.text)
        text = response.group()
        text = json.loads(text[7:-10])
        video_url = text['dash']['video'][0]['baseUrl']
        audio_url = text['dash']['audio'][0]['baseUrl']
        return video_url, audio_url, title[0]
    except:
        print('该视频不支持下载')
        info.set('该视频不支持下载')
        messagebox.showinfo('提示', '该视频不支持下载')
        return None


# 下载合集
def get_html_more(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    response = requests.get(url, headers=headers)
    title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', response.text)
    video_title.set(title[0])
    response = re.search(r'window.__INITIAL_STATE__=.+;\(function', response.text)
    text = json.loads(response.group()[25:-10])
    cids = []
    names = []
    for info in text['videoData']['pages']:
        cids.append(str(info['cid']))
        names.append(info['part'])
    return cids, names


# 下载视频和音频到本地
def download_one(video_url, audio_url, title):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0',
        'Referer': 'https://www.bilibili.com/video/',
        'Origin': 'https://www.bilibili.com',
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    print(title + ' 开始下载')

    try:
        video_response = requests.get(video_url, headers=headers)
        audio_response = requests.get(audio_url, headers=headers)
        with open(title + '.mp4', 'wb') as f:
            f.write(video_response.content)
        with open(title + '.mp3', 'wb') as f:
            f.write(audio_response.content)
    except:
        print(title + ' 下载失败')
        info.set(title + ' 下载失败')
        messagebox.showinfo('抱歉', title + ' 下载失败')
        return
    print(title + ' 下载完成')
    info.set(title + ' 下载完成')
    messagebox.showinfo('恭喜', title + ' 下载完成')


# 下载合集
def download_more(cids, names, url):
    number = len(cids)
    for i in range(number):
        url = url + '?p{}'.format(i + 1)
        video_url, audio_url, title = get_html_one(url)
        download_one(video_url, audio_url, names[i])
        print('=========================================')


# 点击搜索
def serach():
    button1.config(state="active")
    baseurl = 'https://www.bilibili.com/video/{}'
    video_id = entry.get()
    url = baseurl.format(video_id)
    flag = v.get()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
    }
    try:
        response = requests.get(url, headers=headers)
        title = re.findall(r'<title data-vue-meta="true">(.+)_.+</title>', response.text)
        if title[0] == '视频去哪了呢?':
            messagebox.showinfo('提示', '您输入的视频id不正确')
            return
        video_title.set(title[0])
        button1['command'] = lambda: download(url, flag)
    except:
        messagebox.showinfo('提示', '您输入的视频id不正确')
        return


# 点击下载
def download(url, flag):
    button1.config(state="disable")
    if flag == 0:
        video_url, audio_url, title = get_html_one(url)
        if video_url == None:
            return
        download_one(video_url, audio_url, title)
    else:
        cids, names = get_html_more(url)
        download_more(cids, names, url)

    print('下载完成,感谢您的使用')
    info.set('下载完成,感谢您的使用')


# Default download-button handler: shown when the user clicks download before
# searching (serach() rebinds the button once a valid id is found).
def tishi():
    messagebox.showinfo('提示', '请先进行搜索')


if __name__ == '__main__':
    # Fixed-size window centered on the screen.
    root = Tk()
    win_width = root.winfo_screenwidth()
    win_height = root.winfo_screenheight()
    root.geometry('400x270+' + str(int(win_width / 2 - 200)) + '+' + str(int(win_height / 2 - 135)))
    root.minsize(400, 250)
    root.maxsize(400, 250)
    root.title('小破站下载器-敲出一片天')

    # Input box for the video id, with a placeholder hint.
    video_id = StringVar()
    video_id.set('请输入视频ID')

    entry = Entry(root, width=19, font=('隶书', 20), foreground='orange',
                  borderwidth=3, insertbackground='red', textvariable=video_id)
    entry.place(relx=0.02, rely=0.1)

    button = Button(root, width=7, text='搜索', font=('隶书', 18), bg='orange', fg='white', command=serach)
    button.place(relx=0.72, rely=0.09)

    # Mode selection: 0 = single video, 1 = collection (read by serach()).
    v = IntVar()
    v.set(0)
    r1 = Radiobutton(text='单个视频', value=0, font=('隶书', 18), fg='orange', variable=v)
    r2 = Radiobutton(text='视频合集', value=1, font=('隶书', 18), fg='orange', variable=v)
    r1.place(relx=0.05, rely=0.25)
    r2.place(relx=0.45, rely=0.25)

    # Read-only style display of the found video title.
    video_title = StringVar()
    video_title.set('视频标题')
    entry1 = Entry(root, width=30, font=('隶书', 15), fg='black', bg='#F0F0F0', relief='flat',
                   borderwidth=3, insertbackground='red', textvariable=video_title)
    entry1.place(relx=0.06, rely=0.4)

    # serach() rebinds this button's command once a valid id is entered.
    button1 = Button(root, width=8, text='开始下载', font=('隶书', 12), bg='gray', fg='black', command=tishi)
    button1.place(relx=0.7, rely=0.4)

    # Status line updated by the download functions via info.set().
    info = StringVar()
    info.set('下载结果')
    entry_info = Entry(root, width=30, font=('隶书', 15), fg='red', bg='#F0F0F0', relief='flat',
                       borderwidth=3, textvariable=info)
    entry_info.place(relx=0.2, rely=0.6)

    # Downloads run on the UI thread, so the window may appear frozen.
    label = Label(root, text='下载过程可能会出现无响应情况\n下载完就好了', width=30, font=('隶书', 15), fg='black', bg='#F0F0F0',
                  relief='flat',
                  borderwidth=3)
    label.place(relx=0.06, rely=0.8)

    root.mainloop()

4. python爬虫框架scrapy爬取B站排行榜数据并保存到MongoDB数据库

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class BiliItem(scrapy.Item):
    """Item holding one entry of the bilibili all-site ranking list."""
    # MongoDB primary key; left unset by the spider (Mongo assigns it on insert).
    _id = scrapy.Field()
    # Video title text.
    title = scrapy.Field()
    # Play count as scraped from the page (display text, not a number).
    play_num = scrapy.Field()
    # Uploader (UP主) name.
    up_name = scrapy.Field()
    # Ranking score as scraped from the page.
    score = scrapy.Field()


bili.py

# -*- coding: utf-8 -*-
import scrapy
from bilibili.bili.bili.items import BiliItem


class BiliRankeSpider(scrapy.Spider):
    """Spider for the bilibili ranking page; yields one BiliItem per video."""
    name = 'bili_ranke'
    allowed_domains = ['bilibili.com']
    # Presumably the all-category 3-day ranking — TODO confirm the URL scheme.
    start_urls = ['https://www.bilibili.com/ranking/all/0/0/3']

    def parse(self, response):
        """Extract title / play count / uploader / score for every ranked video."""
        titles = response.xpath('//div[@class="info"]//a[@class="title"]/text()').extract()
        play_nums = response.xpath('//div[@class="detail"]/span[@class="data-box"][1]/text()').extract()
        up_names = response.xpath('//div[@class="detail"]/a/span[@class="data-box"][1]/text()').extract()
        scores = response.xpath('//div[@class="pts"]/div/text()').extract()

        # The four lists are parallel, one element per ranking row; zip pairs
        # them up (and silently truncates if any selector missed rows).
        for title, play_num, up_name, score in zip(titles, play_nums, up_names, scores):
            item = BiliItem()
            item['title'] = title
            item['play_num'] = play_num
            item['up_name'] = up_name
            item['score'] = score
            yield item


pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo

class MoviesPipeline(object):
    """Persist scraped items into MongoDB (database `bilibili`, collection `ranke`).

    NOTE(review): the class name says "Movies" but it stores bilibili ranking
    items — presumably copied from another project; confirm it matches the
    ITEM_PIPELINES entry in settings.py before renaming.
    """

    def open_spider(self, spider):
        # Connects to the default local MongoDB instance (localhost:27017).
        self.client = pymongo.MongoClient()

    def process_item(self, item, spider):
        # NOTE(review): pymongo may expect a plain dict here (dict(item)) —
        # verify against the scrapy/pymongo versions in use.
        self.client.bilibili.ranke.insert_one(item)
        return item

    def close_spider(self, spider):
        self.client.close()

附(MongoDB数据库python基本操作)

import pymongo

# 连接数据库
# 默认
# Connect to the database
# Default: localhost:27017
client = pymongo.MongoClient()
# Custom host/port:
# client = pymongo.MongoClient('ip',port)

# Select a database
person = client.person
# Select a collection (table)
student = person.student

# Working with the data
# Find all documents
# result = student.find()
# for r in result:
#     print(r)

# print(result.next())

# Filter by field value
# result = student.find({"age":20})
# for r in result:
#     print(r)

# Sort (1 / pymongo.ASCENDING = ascending order)
# result = student.find().sort("age",1)
# result = student.find().sort("age",pymongo.ASCENDING)
# for r in result:
#     print(r)

# Paging (limit + offset)
# result = student.find().limit(3)
# for r in result:
#     print(r)
#
#
# result = student.find().limit(3).skip(2)
# for r in result:
#     print(r)

# Count documents
# result = student.find().count()
# print(result)

# Insert a document
# data = {"name":'曾强','age':22}
# student.insert(data)
# result = student.count()
# print(result)

# Delete a document
# data = {"name":'zq2','age':20}
# student.remove(data)

# Update a document: fetch it, modify a field, write it back with $set.
# NOTE(review): insert()/remove()/update()/count() above are the legacy API,
# removed in PyMongo 4 — modern code uses insert_one/delete_one/update_one/
# count_documents. This snippet assumes an older PyMongo; confirm the version.
data = {"name":"zq1"}
result = student.find_one(data)
print(result)
result["country"]="中国"
student.update(data,{'$set':result})

以上项目我都在bilibili上录有视频,看不明白可以去看一下视频,我的B站名:敲出一片天_bili

项目
python
爬虫
  • 作者:曾强(联系作者)
  • 发表时间:2020-04-11 22:32
  • 版权声明:自由转载-非商用-非衍生-保持署名
  • 转载声明:转载时请注明出处:www.zengqiang.club
  • 注:如果文章有错误,望请评论指出,谢谢;如果看了文章还有不明白的地方,欢迎进群与我交流。
  • 评论