# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#quick-start
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
# print(soup.prettify())
# print(soup.title)
# .string and .get_text() behave largely the same here
# print(soup.title.string)
# print(soup.title.get_text())
# Print the parent tag's name
# print(soup.p.parent.name)
# print(soup.find_all('a'))
# Find content by id
# print(soup.find(id='link1').string)
# Print the text of every <a> tag
# for link in soup.find_all('a'):
#     print(link.get_text())
# Get content by tag name and class name
# print(soup.find('p', {'class': 'story'}))
# print(soup.find('p', {'class': 'story'}).get_text())
# print(soup.find_all("p", class_="story"))
# Fetch a web page
# resp = urlopen('http://www.baidu.com')
# baidu = BeautifulSoup(resp, 'html.parser')
# print(baidu.prettify())
# Print the names of tags that start with "b"
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
# Print the <a> tags whose href starts with http://example.com
print(soup.find_all('a', href=re.compile(r"^http://example\.com")))
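# bs4 also accepts CSS selectors via select(); a minimal sketch against the
# same html_doc (the selector strings are just illustrative examples):
print(soup.select('p.story a.sister'))       # all three sister links
print(soup.select('a#link2')[0].get_text())  # Lacie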
Scrape links and text from a Wikipedia page and store them in MySQL (PyMySQL)
import pymysql.cursors
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
resp = urlopen("https://en.wikipedia.org/wiki/Main_Page")
soup = BeautifulSoup(resp, 'html.parser')
# print(soup)
listUrls = soup.find_all('a', href=re.compile('^/wiki/'))
# print(listUrls)
# Connect to the database
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='',
                             db='wikiUrls',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
try:
    with connection.cursor() as cursor:
        sql = "insert into `url` (`url`, `href`) values (%s, %s)"
        for url in listUrls:
            # Skip links that point to images
            if not re.search(r'\.(jpg|JPG)', url["href"]):
                print(url.get_text(), ' : ', 'https://en.wikipedia.org' + url["href"])
                cursor.execute(sql, (url.get_text(), 'https://en.wikipedia.org' + url["href"]))
    connection.commit()
finally:
    connection.close()
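# A minimal sketch of the schema the insert above assumes (the database name
# wikiUrls and the table/column names come from the code; the id column and
# the VARCHAR sizes are our assumptions):
# CREATE DATABASE wikiUrls DEFAULT CHARACTER SET utf8mb4;
# CREATE TABLE `url` (
#     `id`   INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
#     `url`  VARCHAR(255) NOT NULL,
#     `href` VARCHAR(255) NOT NULL
# ) DEFAULT CHARSET = utf8mb4;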
Read a txt file
from urllib.request import urlopen
text = urlopen('https://en.wikipedia.org/robots.txt')
print(text.read().decode('utf-8'))
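# A minimal variant of the same request (our own sketch): the with-block closes
# the connection automatically; the timeout and URLError handling are assumptions.
from urllib.error import URLError

try:
    with urlopen('https://en.wikipedia.org/robots.txt', timeout=10) as resp:
        print(resp.read().decode('utf-8'))
except URLError as e:
    print('request failed:', e)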
Read a PDF file
# https://pypi.python.org/pypi/pdfminer3k
# pip install pdfminer3k
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# Open the document file
fp = open('media/naacl06-shinyama.pdf', 'rb')
# Create a parser tied to the file
parser = PDFParser(fp)
# Create the PDF document object
doc = PDFDocument()
# Connect the parser and the document object
parser.set_document(doc)
doc.set_parser(parser)
# Initialize the document (empty password)
doc.initialize('')
# Create a PDF resource manager
resource = PDFResourceManager()
# Create the layout-analysis parameters
laparam = LAParams()
# Create an aggregator device
device = PDFPageAggregator(resource, laparams=laparam)
# Create a page interpreter
interpreter = PDFPageInterpreter(resource, device)
# Walk the document's pages
for page in doc.get_pages():
    # Let the interpreter read the page
    interpreter.process_page(page)
    # Collect the page content from the aggregator
    layout = device.get_result()
    for out in layout:
        if hasattr(out, 'get_text'):
            print(out.get_text())
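# The same steps wrapped as a reusable function (our own sketch; the
# pdfminer3k calls are identical to the ones above, only the function
# name pdf_to_text is new):
def pdf_to_text(path, password=''):
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        resource = PDFResourceManager()
        device = PDFPageAggregator(resource, laparams=LAParams())
        interpreter = PDFPageInterpreter(resource, device)
        text = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            for out in device.get_result():
                if hasattr(out, 'get_text'):
                    text.append(out.get_text())
        return ''.join(text)

# print(pdf_to_text('media/naacl06-shinyama.pdf'))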
Notes from an imooc.com (慕课网) course.