前言
BeautifulSoup 是一个可以从 HTML 或 XML 文件中解析并提取数据的 Python 库,Requests 是一个常用的、用于发送 HTTP 请求并获取响应的库,you-get 则是一个可以方便地下载各大网站视频的命令行工具。整体流程是:先用 Requests 请求获得网页源代码,再用 BeautifulSoup 解析网页并筛选出自己需要的信息(如视频的 URL),最后用 you-get 下载。
例子
以下代码实现的是下载B站电影。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 15 12:06:21 2020
@author: weiquan fan
"""
import os
import re
import subprocess
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as bs
def download(url, filename):
    """Download the video at *url* with the ``you-get`` command-line tool.

    The file is stored under ``./Videos`` and named *filename* (you-get
    appends the container extension itself).

    Args:
        url: Address of the video page to hand to you-get.
        filename: Output file name, without extension.

    Raises:
        FileNotFoundError: If the ``you-get`` executable is not on PATH.
        subprocess.CalledProcessError: If you-get exits with a non-zero
            status, i.e. the download failed.
    """
    path_root = './Videos'
    # Pass the arguments as a list instead of a formatted shell string:
    # scraped titles routinely contain spaces, quotes or shell
    # metacharacters, which would otherwise split the command line or be
    # interpreted by the shell.  check=True turns a failed download into an
    # exception so that callers can actually detect it.
    subprocess.run(
        ['you-get', '-o', path_root, '-O', filename, url],
        check=True,
    )
# Landing page of bilibili's movie channel; the movie cards on it are scraped.
url_base = 'https://www.bilibili.com/movie/?spm_id_from=333.851.b_62696c695f7265706f72745f6d6f766965.2'
# Stop once this many videos have been downloaded successfully.
MAX_DOWNLOADS = 15

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error instead of silently parsing an
# error page that contains no videos.
response = requests.get(url_base, timeout=30)
response.raise_for_status()
page = response.text
soup = bs(page, 'html.parser')

# Movie cards are <li> elements whose class matches the pattern below.
vids = soup.find_all('li', attrs={'class': re.compile('video-item-biref.*?')})  # bilibili

video_urls = []  # [link, title] pair for every card a download was attempted for
counter = 1      # 1 + number of successful downloads so far
for v in vids:
    print(v)
    anchor = v.find('a')
    image = v.find('img')
    # Skip cards without a usable link or title rather than crashing the
    # whole run with a TypeError/KeyError on a single malformed entry.
    if anchor is None or image is None:
        continue
    href = anchor.get('href')
    v_name = image.get('alt')
    if not href or not v_name:
        continue
    # hrefs may be relative or protocol-relative ("//host/path"); resolve
    # them against the page URL so the downloader receives an absolute URL.
    v_link = urljoin(url_base, href)
    video_urls.append([v_link, v_name])
    print(v_link, v_name)
    try:
        download(v_link, v_name)
    except Exception:
        # Best effort: report the failure and move on to the next card.
        # Failed downloads do not count towards MAX_DOWNLOADS.
        print(f"can't download {v_name} in {v_link}")
    else:
        counter += 1
    if counter > MAX_DOWNLOADS:
        break