数据提取之bs4（BeautifulSoup4）模块与Css选择器

BeautifulSoup4

from bs4 import BeautifulSoup

创建对象 <class 'bs4.BeautifulSoup'>

soup = BeautifulSoup(源码, '解析器')

bs4标签种类

（1）tag: 标签
print(soup.title, type(soup.title))
（2）获取标签里面的文本内容, 可导航的字符串，数据类型是<class 'bs4.element.NavigableString'>对象，可以使用字符串的方法
title = soup.title
# string
print(title.string, type(title.string))
（3）注释
# 注释 <class 'bs4.element.Comment'>
html = ''
soup2 = BeautifulSoup(html, 'lxml')

遍历文档树

# 解析数据
head_tag = soup.p #默认获取第一个p标签
# 获取标签的子节点, .contents: 返回的是一个所有子节点的列表
# print(head_tag.contents)

print(head_tag.children) # 返回的是一个生成器对象，通过循环遍历取值
for head in head_tag.children:
    print(head)

源码：

# 1. 导入模块
from bs4 import BeautifulSoup
# 源码
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p><p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p><p class="story">...</p>
"""
# 2. 创建对象  <class 'bs4.BeautifulSoup'>
soup = BeautifulSoup(html_doc, 'lxml')

获取节点文本内容

.string

# 通过上一级标签，去获取子级的标签文本内容
# head = soup.head
# print(head.string)

.text

# print(head.text) # 获取的是多个子级标签的文本内容，内容都拼接在一起

# strings/stripped_strings
contents = soup.html
# print(contents.string) # 没有获取
# print(contents.text)

.strings

# print(contents.strings) # <generator object Tag._all_strings at 0x000001E214912820> 生成器对象
# strings可以获取这个标签下的所有的文本，文本内容包含很多空行
# for data in contents.strings:
# print(data)

.stripped_strings

# stripped_strings可以获取这个标签下的所有的文本，去除了多空行
# for data in contents.stripped_strings:
# print(data)

总结：

获取标签文本内容
string: 标签里面只有一个子节点（一段文本，或只含一段文本的子标签）时返回该文本，否则返回None；返回值是可导航的字符串（使用 BeautifulSoup 解析文档后，标签（Tag）之间的文本会被转换为NavigableString对象，比如：<p>Hello, World!</p>，其中的"Hello, World!"就是一个NavigableString对象。之所以称为"可导航"，是因为可以通过它访问周围的其他元素（如父标签、兄弟标签等），形成一个树形结构）
text: 将所有的标签文本内容拼接在一起
strings: 依次获取所有的标签文本内容，包含空行，返回的是一个生成器对象
stripped_strings: 依次获取所有的标签文本内容，去除多余的空行，返回的是一个生成器对象

获取父节点

.parent

title_tag = soup.title
print(title_tag) #<title>The Dormouse's story</title>
print(title_tag.parent) #<head><title>The Dormouse's story</title></head>

.parents

# 一旦获取的是多个标签内容，返回的就是生成器
a_tag = soup.a
print(a_tag.parents) #<generator object PageElement.parents at 0x0000027F14C502E0>
for p in a_tag.parents:
    print(p.name)
'''
p
body
html
[document]
'''

获取同级节点

源代码：

from bs4 import BeautifulSoup
html2 = """<a>
<b>bbbb</b><c>ccccc</c><d>dddd</d>
</a>"""
soup = BeautifulSoup(html2, 'lxml')
b_tag = soup.c
# print(b_tag.next_sibling)  # 跟他相邻的下一个节点
# print(b_tag.next_siblings)  # 跟他相邻的后面所有的兄弟节点
print(b_tag.previous_sibling)  # 上一个兄弟节点
print(b_tag.previous_siblings)  # 前面所有的兄弟节点

.next_sibling/.next_siblings

# print(b_tag.next_sibling) # 跟他相邻的下一个节点
# print(b_tag.next_siblings) # 跟他相邻的下一个所有节点

.previous_sibling/.previous_siblings

print(b_tag.previous_sibling) # 上一个兄弟节点
print(b_tag.previous_siblings) # 前面所有的兄弟节点

方法使用（核心）

find(): 查找一个

# 获取a标签
a_tag = soup.find('a') # 默认查找第一个标签
print(a_tag)

find_all():查找所有

a_tag_all = soup.find_all('a')
print(a_tag_all) # 返回的是一个列表，用循环遍历

# 同时找两个标签，标签使用元组或列表使其成为一个整体，不能分开写
a_p_tag = soup.find_all(('title', 'b')) #也可以写为：a_p_tag = soup.find_all(['title', 'b'])
print(a_p_tag)

案例：

# 导入模块
from bs4 import BeautifulSoup
html = """
<table class="tablelist" cellpadding="0" cellspacing="0"><tbody><tr class="h"><td class="l" width="374">职位名称</td><td>职位类别</td><td>人数</td><td>地点</td><td>发布时间</td></tr><tr class="even"><td class="l square"><a target="_blank" href="position_detail.php?id=33824&keywords=python&tid=87&lid=2218">22989-金融云区块链高级研发工程师（深圳）</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-25</td></tr><tr class="odd"><td class="l square"><a target="_blank" href="position_detail.php?id=29938&keywords=python&tid=87&lid=2218">22989-金融云高级后台开发</a></td><td>技术类</td><td>2</td><td>深圳</td><td>2017-11-25</td></tr><tr class="even"><td class="l square"><a target="_blank" href="position_detail.php?id=31236&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐运营开发工程师（深圳）</a></td><td>技术类</td><td>2</td><td>深圳</td><td>2017-11-25</td></tr><tr class="odd"><td class="l square"><a target="_blank" href="position_detail.php?id=31235&keywords=python&tid=87&lid=2218">SNG16-腾讯音乐业务运维工程师（深圳）</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-25</td></tr><tr class="even"><td class="l square"><a target="_blank" href="position_detail.php?id=34531&keywords=python&tid=87&lid=2218">TEG03-高级研发工程师（深圳）</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr><tr class="odd"><td class="l square"><a target="_blank" href="position_detail.php?id=34532&keywords=python&tid=87&lid=2218">TEG03-高级图像算法研发工程师（深圳）</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr><tr class="even"><td class="l square"><a target="_blank" href="position_detail.php?id=31648&keywords=python&tid=87&lid=2218">TEG11-高级AI开发工程师（深圳）</a></td><td>技术类</td><td>4</td><td>深圳</td><td>2017-11-24</td></tr><tr class="odd"><td class="l square"><a target="_blank" href="position_detail.php?id=32218&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr><tr class="even"><td class="l square"><a target="_blank" 
href="position_detail.php?id=32217&keywords=python&tid=87&lid=2218">15851-后台开发工程师</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr><tr class="odd"><td class="l square"><a id="test" class="test" target='_blank' href="position_detail.php?id=34511&keywords=python&tid=87&lid=2218">SNG11-高级业务运维工程师（深圳）</a></td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr></tbody>
</table>
<a href="https://www.baid.com">百度一下</a>
<a href="https://www.douban.com">豆瓣一下</a>
<a herf="https://www.python.com">python一下</a>
"""
# 实例化一个对象
soup = BeautifulSoup(html, 'lxml')
# 获取的是所有的tr标签
# trs = soup.find_all('tr')
# 获取class="even"标签, bs4 class属性定位
# trs = soup.find_all('tr', class_="even")
# trs = soup.find_all('tr', attrs={"class": "even"})
# print(trs)
# <a id="test" class="test"
# a = soup.find_all('a', id="test", class_="test")
# find_all没有获取到数据，返回的是一个空列表
# trs = soup.find_all('a', attrs={"class": "test", "id":"test"})
# print(trs)
# 获取所有a标签里面的href属性值
# a_lst = soup.find_all('a')
# for a in a_lst:
#     '''
#     get(): 属性不存在，返回的是一个none
#     对象['属性名']: 属性名 不存在，报错
#     '''
#     # href = a.get('href')
#     href = a['href']
#     print(href)
# 获取职位名称
trs = soup.find_all('tr')[1:]
for tr in trs:
    # print(tr)
    a = tr.find('a')
    print(a.string)

获取class="even"标签, bs4 class属性定位

# trs = soup.find_all('tr', class_="even")
# trs = soup.find_all('tr', attrs={"class": "even"})
# print(trs)

通过id和属性定位

# <a id="test" class="test"
# a = soup.find_all('a', id="test", class_="test")
# find_all没有获取到数据，返回的是一个空列表
# trs = soup.find_all('a', attrs={"class": "test", "id":"test"})
# print(trs)

获取所有a标签里面的href属性值

get(): 属性不存在，返回的是None

对象['属性名']: 属性名不存在，报错

# a_lst = soup.find_all('a')
# for a in a_lst:
#     '''
#     get(): 属性不存在，返回的是一个none
#     对象['属性名']: 属性名不存在，报错
#     '''
#     # href = a.get('href')
#     href = a['href']
#     print(href)

# 获取职位名称
trs = soup.find_all('tr')[1:]
for tr in trs:
    # print(tr)
    a = tr.find('a')
    print(a.string)

Css选择器

select：查找标签，默认查找所有，并且返回的数据类型是list

print(soup.select('a')) #查找所有的a标签

定位class='sister'的标签，使用.sister（即 .属性值）

print(soup.select('.sister'))

定位id="link1"的标签，使用#link1

print(soup.select('#link1'))

定位p标签并且id值为link1，使用p#link1

print(soup.select('p#link1'))

获取的title标签下的文本信息，select: 只有一个元素，返回的数据类型还是列表

print(soup.select('title')[0].string)
print(soup.select('title')[0].text)
print(list(soup.select('title')[0].strings))
print(list(soup.select('title')[0].stripped_strings))

p a: 选择所有位于p元素内的a元素，无论嵌套了多少层，都能找到

print(soup.select('p a'))

p>a: 选择所有作为p元素直接子元素的a标签

print(soup.select('p>a'))

<p class="story">Once upon a time there were three little sisters; and their names were<span><a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>,<a href="sister">余承东</a>;and they lived at the bottom of a well.</span>
</p>