bs4(Beautiful Soup 4)作为Python解析HTML与XML的一枚神器,很有必要写一篇简明教程介绍一下。
如何安装
在Debian和Ubuntu下可以采用如下方式:
$ apt-get install python-bs4
也可以通过PyPI安装(以下两条命令任选其一):
$ easy_install beautifulsoup4
$ pip install beautifulsoup4
当然,还可以通过下载源码,用setup.py的方式进行安装:
$ python setup.py install
简明使用
下面先举个例子:
from bs4 import BeautifulSoup
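html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""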
soup = BeautifulSoup(html_doc)
print(soup.prettify())
# 我们可以看到html代码已经被格式化了
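# <html>
#  <head>
#   <title>
#    The Dormouse's story
#   </title>
#  </head>
#  <body>
#   <p class="title">
#    <b>
#     The Dormouse's story
#    </b>
#   </p>
#   <p class="story">
#    Once upon a time there were three little sisters; and their names were
#    <a class="sister" href="http://example.com/elsie" id="link1">
#     Elsie
#    </a>
#    ,
#    <a class="sister" href="http://example.com/lacie" id="link2">
#     Lacie
#    </a>
#    and
#    <a class="sister" href="http://example.com/tillie" id="link3">
#     Tillie
#    </a>
#    ; and they lived at the bottom of a well.
#   </p>
#   <p class="story">
#    ...
#   </p>
#  </body>
# </html>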
# 下面我们再看看如何获得html的各个node
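soup.title
# <title>The Dormouse's story</title>
soup.title.name
# u'title'
soup.title.string
# u'The Dormouse's story'
soup.title.parent.name
# u'head'
soup.p
# <p class="title"><b>The Dormouse's story</b></p>
soup.p['class']
# [u'title']
soup.a
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
soup.find_all('a')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.find(id="link3")
# <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>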
# 如果我们想提取所有的锚链接,那实在是太简单了
for link in soup.find_all('a'):
    print(link.get('href'))
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie
# 如果我们只想提取所有的文字呢?那也还是很简单
print(soup.get_text())
# The Dormouse's story
#
# The Dormouse's story
#
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
#
# ...
进阶使用
##Tag
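soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
tag = soup.b
type(tag)
# <class 'bs4.element.Tag'>
##Name
tag.name
# u'b'
tag.name = "blockquote"
tag
# <blockquote class="boldest">Extremely bold</blockquote>
##Attributes
tag['class']
# [u'boldest']
tag.attrs
# {u'class': [u'boldest']}
tag['class'] = 'verybold'
tag['id'] = 1
tag
# <blockquote class="verybold" id="1">Extremely bold</blockquote>
del tag['class']
del tag['id']
tag
# <blockquote>Extremely bold</blockquote>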
tag['class']
# KeyError: 'class'
print(tag.get('class'))
# None
##Multi-valued Attributes
css_soup = BeautifulSoup('<p class="body strikeout"></p>')
css_soup.p['class']
# ["body", "strikeout"]
css_soup = BeautifulSoup('<p class="body"></p>')
css_soup.p['class']
# ["body"]
id_soup = BeautifulSoup('<p id="my id"></p>')
id_soup.p['id']
# 'my id'
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
rel_soup.a['rel']
# ['index']
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)
# <p>Back to the <a rel="index contents">homepage</a></p>
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
xml_soup.p['class']
# u'body strikeout'
##comments
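markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup = BeautifulSoup(markup)
comment = soup.b.string
type(comment)
# <class 'bs4.element.Comment'>
comment
# u'Hey, buddy. Want to buy a used parser?'
print(soup.b.prettify())
# <b>
#  <!--Hey, buddy. Want to buy a used parser?-->
# </b>
from bs4 import CData
cdata = CData("A CDATA block")
comment.replace_with(cdata)
print(soup.b.prettify())
# <b>
#  <![CDATA[A CDATA block]]>
# </b>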
更多内容就不搬了,请移步到bs4英文文档:https://www.crummy.com/software/BeautifulSoup/bs4/doc/