代码如下:
# -*- coding: UTF-8 -*-
"""
@Author :叶庭云
@公众号 :修炼Python
@CSDN :https://yetingyun.blog.csdn.net/

Scrape the Douban Top 250 movie list with requests + BeautifulSoup and
save one row per movie into an Excel workbook via openpyxl.
"""
import requests
from bs4 import BeautifulSoup
import openpyxl
from fake_useragent import UserAgent
import logging

# Basic logging configuration.
logging.basicConfig(level=logging.INFO, format='%(asctime)s – %(levelname)s: %(message)s')
# Random User-Agent generator.
ua = UserAgent(verify_ssl=False, path='fake_useragent.json')

wb = openpyxl.Workbook()   # create the workbook object
sheet = wb.active          # get the workbook's active worksheet
sheet.title = "movie"      # rename the worksheet
sheet.append(["排名", "电影名", "导演和主演", "上映时间", "上映地区", "电影类型", "评分", "评价人数", "引言"])


def random_ua():
    """Return HTTP request headers with a randomly chosen User-Agent."""
    headers = {
        "Accept-Encoding": "gzip",
        "Connection": "keep-alive",
        "User-Agent": ua.random
    }
    return headers


def scrape_html(url):
    """Fetch *url* and return the page HTML, or None on a non-200 response."""
    resp = requests.get(url, headers=random_ua())
    if resp.status_code == 200:
        return resp.text
    else:
        logging.info('请求网页失败')
        return None


def get_data(page):
    """Parse Top-250 result page *page* (0-based, 25 movies per page)
    and append one row per movie to the worksheet."""
    global rank
    url = f"https://movie.douban.com/top250?start={25 * page}&filter="
    html_text = scrape_html(url)
    if html_text is None:
        # The request failed; skip this page instead of crashing in BeautifulSoup.
        return
    soup = BeautifulSoup(html_text, 'html.parser')
    lis = soup.find_all('div', class_='item')
    for li in lis:
        name = li.find('div', class_='hd').a.span.text
        # The <p> holds two lines: "director / actors" and "year / region / genres".
        # Split on '\n' (a single-space split cannot yield these two fields).
        temp = li.find('div', class_='bd').p.text.strip().split('\n')
        director_actor = temp[0]
        temp1 = temp[1].rsplit('/', 2)
        time_, area, genres = [item.strip() for item in temp1]
        quote = li.find('p', class_='quote')
        # Some movies have no one-line quote.
        if quote:
            quote = quote.span.text
        else:
            quote = None
        rating_score = li.find('span', class_='rating_num').text
        rating_num = li.find('div', class_='star').find_all('span')[-1].text
        sheet.append([rank, name, director_actor, time_, area, genres, rating_score, rating_num, quote])
        logging.info([rank, name, director_actor, time_, area, genres, rating_score, rating_num, quote])
        rank += 1


if __name__ == '__main__':
    rank = 1
    for i in range(10):
        get_data(i)
    wb.save(filename='movie_info4.xlsx')
PyQuery
- 每个网页,都有一定的特殊结构和层级关系,并且很多节点都有 id 或 class 作为区分,我们可以借助它们的结构和属性来提取信息。
- 强大的 HTML 解析库:pyquery,利用它,我们可以直接解析 DOM 节点的结构,并通过 DOM 节点的一些属性快速进行内容提取。
如下示例:在解析 HTML 文本的时候,首先需要将其初始化为一个 pyquery 对象。它的初始化方式有多种,比如直接传入字符串、传入 URL、传入文件名等等。
from pyquery import PyQuery as pq

# A small HTML fragment used to demonstrate initializing pyquery
# directly from a string.
html = '''
<div>
<ul class="clearfix">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li><img src="http://pic.netbian.com/uploads/allimg/210107/215736-1610027856f6ef.jpg"></li>
<li><img src="http://pic.netbian.com//uploads/allimg/190902/152344-1567409024af8c.jpg"></li>
</ul>
</div>
'''

# Build the PyQuery document, then select every <li> node with a CSS selector.
document = pq(html)
print(document('li'))
结果如下:
- <li class="item-0">first item</li>
- <li class="item-1"><a href="link2.html">second item</a></li>
- <li><img src="http://pic.netbian.com/uploads/allimg/210107/215736-1610027856f6ef.jpg"/></li>
- <li><img src="http://pic.netbian.com//uploads/allimg/190902/152344-1567409024af8c.jpg"/></li>
首先引入 pyquery 这个对象,取别名为 pq,然后定义了一个长 HTML 字符串,并将其当作参数传递给 pyquery 类,这样就成功完成了初始化。接下来,将初始化的对象传入 CSS 选择器。在这个实例中,我们传入 li 节点,这样就可以选择所有的 li 节点。
代码如下:
# -*- coding: UTF-8 -*-
"""
@Author :叶庭云
@公众号 :修炼Python
@CSDN :https://yetingyun.blog.csdn.net/

Scrape the Douban Top 250 movie list with requests + pyquery and save
one row per movie into an Excel workbook via openpyxl.
"""
import requests
from pyquery import PyQuery as pq
import openpyxl
from fake_useragent import UserAgent
import logging

# Basic logging configuration.
logging.basicConfig(level=logging.INFO, format='%(asctime)s – %(levelname)s: %(message)s')
# Random User-Agent generator.
ua = UserAgent(verify_ssl=False, path='fake_useragent.json')

wb = openpyxl.Workbook()   # create the workbook object
sheet = wb.active          # get the workbook's active worksheet
sheet.title = "movie"      # rename the worksheet
sheet.append(["排名", "电影名", "导演和主演", "上映时间", "上映地区", "电影类型", "评分", "评价人数", "引言"])


def random_ua():
    """Return HTTP request headers with a randomly chosen User-Agent."""
    headers = {
        "Accept-Encoding": "gzip",
        "Connection": "keep-alive",
        "User-Agent": ua.random
    }
    return headers


def scrape_html(url):
    """Fetch *url* and return the page HTML, or None on a non-200 response."""
    resp = requests.get(url, headers=random_ua())
    if resp.status_code == 200:
        return resp.text
    else:
        logging.info('请求网页失败')
        return None


def get_data(page):
    """Parse Top-250 result page *page* (0-based, 25 movies per page)
    and append one row per movie to the worksheet."""
    global rank
    url = f"https://movie.douban.com/top250?start={25 * page}&filter="
    html_text = scrape_html(url)
    if html_text is None:
        # The request failed; skip this page instead of crashing in pyquery.
        return
    doc = pq(html_text)
    lis = doc('.grid_view li')
    for li in lis.items():
        name = li('.hd a span:first-child').text()
        # The <p> holds two lines: "director / actors" and "year / region / genres".
        # Split on '\n' (a single-space split cannot yield these two fields).
        # NOTE(review): newer pyquery releases normalize whitespace in .text();
        # confirm the two lines survive as separate entries against live output.
        temp = li('.bd p:first-child').text().split('\n')
        director_actor = temp[0]
        temp1 = temp[1].rsplit('/', 2)
        time_, area, genres = [item.strip() for item in temp1]
        # .text() returns '' when the quote node is absent, so no guard is needed.
        quote = li('.quote span').text()
        rating_score = li('.star .rating_num').text()
        rating_num = li('.star span:last-child').text()
        sheet.append([rank, name, director_actor, time_, area, genres, rating_score, rating_num, quote])
        logging.info([rank, name, director_actor, time_, area, genres, rating_score, rating_num, quote])
        rank += 1


if __name__ == '__main__':
    rank = 1
    for i in range(10):
        get_data(i)
    wb.save(filename='movie_info3.xlsx')