PyQuery 库学习笔记 发表于 2017-05-15 | 分类于 爬虫 | | 阅读次数 次 爬虫学习路线第三站 - PyQuery库的使用 初始化字符串初始化12345678910111213141516from pyquery import PyQuery as pyhtml = '''<div> <ul> <li class="item-0">first item</li> <li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li> <li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li> <li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li> </ul> </div>'''doc = py(html)print(doc('li')) URL初始化12345678from pyquery import PyQuery as py# 通过URL来获取doc = py(url='http://www.baidu.com')# <class 'pyquery.pyquery.PyQuery'>print(type(doc('title')))# 输出选中的head标签print(doc('head')) 文件的初始化12345678from pyquery import PyQuery as py# 通过文件来获取doc = py(filename='demo1.html')# <class 'pyquery.pyquery.PyQuery'>print(type(doc('li')))# 输出所有的li标签print(doc('li')) 基本的CSS选择器1234567891011121314151617from pyquery import PyQuery as pyhtml = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li> <li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li> <li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li> </ul> </div>'''doc = py(html)# 选中id为container中的class为list中的li标签print(doc('#container .list li')) 查找元素子元素12345678910111213141516171819202122232425262728293031from pyquery import PyQuery as pyhtml = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li> <li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li> <li class="item-1 
active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li> <li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li> </ul> </div>'''doc = py(html)# 获取class为list的元素items = doc('.list')# <class 'pyquery.pyquery.PyQuery'>print(type(items))print(items)# 在先前找到的元素中获取li标签lis = items.find('li')# <class 'pyquery.pyquery.PyQuery'>print(type(lis))print(lis)# 获取先前找到的元素中的所有子元素lis2 = items.children()print(type(lis2))print(lis2)# 获取先前找到的元素中的class为active的元素li3 = items.children('.active')print(li3) 父元素1234567891011121314151617181920212223242526272829from pyquery import PyQuery as pyhtml = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li> <li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li> <li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li> </ul> </div>'''doc = py(html)# 获取class为list的元素items = doc('.list')# 获取所选元素的父元素container = items.parent()print(type(container))print(container)print("==========================")# 获取所选元素的所有父元素parents = items.parents()print(type(parents))print(parents)print("==========================")# 获取所选元素的所有父元素中class为container的元素(注意:示例HTML中container是id而非class,此选择器匹配结果为空;按id筛选应写作'#container')parent = items.parents('.container')print(parent) 兄弟元素123456789101112131415161718192021from pyquery import PyQuery as pyhtml = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li> <li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li> <li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li> </ul> </div>'''doc = 
py(html)# 获取class为list的元素items = doc('.list')li = doc('.list .item-0.active')# 查找选中元素的所有兄弟元素(不包含自己)print(li.siblings())# 查找选中元素的所有兄弟元素中class为active的元素(不包含自己)print(li.siblings('.active')) 遍历单个元素123456789101112131415161718from pyquery import PyQuery as pyhtml = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li> <li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li> <li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li> </ul> </div>'''doc = py(html)# 选中单个元素li = doc('.item-0.active')print(li) 多个元素123456789101112131415161718192021from pyquery import PyQuery as pyhtml = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li> <li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li> <li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li> </ul> </div>'''doc = py(html)# 查找所有li标签lis = doc('li').items()# <class 'generator'>print(type(lis))for li in lis: print(li) 获取信息获取属性123456789101112131415161718192021from pyquery import PyQuery as pyhtml = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li> <li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li> <li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li> </ul> 
</div>'''doc = py(html)a = doc('.item-0.active a')# <a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a>print(a)# https://ask.hellobi.com/link3.html 获取选中标签的href属性print(a.attr('href'))# https://ask.hellobi.com/link3.htmlprint(a.attr.href) 获取文本12345678910111213141516171819from pyquery import PyQuery as pyhtml = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li> <li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li> <li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li> </ul> </div>'''doc = py(html)a = doc('.item-0.active a')# <a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a>print(a)# 获取a标签的内容print(a.text()) 获取HTML12345678910111213141516171819from pyquery import PyQuery as pyhtml = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li> <li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li> <li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li> </ul> </div>'''doc = py(html)li = doc('.item-1.active')# <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li>print(li)# 获取li标签的HTMLprint(li.html()) DOM操作addClass、removeClass12345678910111213141516171819202122from pyquery import PyQuery as pyhtml = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li> <li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span 
class="bold">third item</span></a></li> <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li> <li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li> </ul> </div>'''doc = py(html)li = doc('.item-0.active')print(li)# 移除classli.removeClass('active')print(li)# 添加classli.addClass('active')print(li) attr、css12345678910111213141516171819202122from pyquery import PyQuery as pyhtml = '''<div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li> <li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li> <li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li> </ul> </div>'''doc = py(html)li = doc('.item-0.active')print(li)# 添加name属性li.attr('name', 'link')print(li)# 添加css样式li.css('font-size', '14px')print(li) remove1234567891011121314from pyquery import PyQuery as pyhtml = '''<div class="wrap"> Hello, World <p>This is a paragraph.</p> </div>'''doc = py(html)wrap = doc('.wrap')print(wrap.text())# 在选择的元素中找到p标签并移除wrap.find('p').remove()print(wrap.text()) 其他DOM方法 http://pyquery.readthedocs.io/en/latest/api.html 伪类选择器1234567891011121314151617181920212223242526272829303132333435from pyquery import PyQuery as pyhtml = '''<div class="wrap"> <div id="container"> <ul class="list"> <li class="item-0">first item</li> <li class="item-1"><a href="https://ask.hellobi.com/link2.html">second item</a></li> <li class="item-0 active"><a href="https://ask.hellobi.com/link3.html"><span class="bold">third item</span></a></li> <li class="item-1 active"><a href="https://ask.hellobi.com/link4.html">fourth item</a></li> <li class="item-0"><a href="https://ask.hellobi.com/link5.html">fifth item</a></li> </ul> </div> </div>'''doc = py(html)# 找到第一个lili = 
doc('li:first-child')print(li)# 找到最后一个lili = doc('li:last-child')print(li)# 找到第二个lili = doc('li:nth-child(2)')print(li)# 找到索引大于2(索引从0开始,即第四个及之后)的lili = doc('li:gt(2)')print(li)# 找到第偶数个lili = doc('li:nth-child(2n)')print(li)# 找到内容包含second的lili = doc('li:contains(second)')print(li) 更多选择器 http://www.w3school.com.cn/css/index.asp 官方文档 http://pyquery.readthedocs.io/ 坚持原创技术分享,您的支持将鼓励我继续创作! 赏 微信打赏