不是教程,只是给自己参考用,各位客官请留神,小白可能不明白啥意思。
#!/usr/bin/python
# encoding: utf-8
from calibre.web.feeds.recipes import BasicNewsRecipe
class Pro_Git_Chinese(BasicNewsRecipe):
    """Calibre recipe that bundles the QCustomplot blog series into one e-book.

    The list of articles is scraped from the category page at ``url_pre``.

    NOTE(review): the class name looks like a leftover from a different
    recipe; it is kept unchanged so anything referring to it keeps working.
    """

    title = 'QCustomplot'
    description = 'QCustomplot介绍'
    cover_url = ''
    # calibre's recipe API names this attribute ``__author__``; the original
    # spelling ``_author_`` is kept below only as a backward-compatible alias.
    __author__ = '朝十晚八'
    _author_ = __author__
    url_pre = 'https://www.cnblogs.com/swarmbees/category/908110.html'
    no_stylesheets = True  # strip the site's CSS
    # Keep only the element(s) carrying the ``blogpost-body`` class.
    keep_only_tags = [{ 'class': 'blogpost-body' }]
    simultaneous_downloads = 1  # max download threads (default is 5)

    def parse_index(self):
        """Return the article index for this recipe.

        Fetches the category page, collects every link that has both an
        ``id`` and an ``href`` inside ``<div class="entrylist">``, and lists
        them in the reverse of their order on the page.

        Returns:
            A one-element list ``[(self.title, articles)]`` where
            ``articles`` is a list of ``{'title': ..., 'url': ...}`` dicts.
        """
        soup = self.index_to_soup(self.url_pre)

        # The article list lives inside <div class="entrylist">.
        container = soup.find('div', {'class': 'entrylist'})
        if container is None:
            # Fail with a readable message instead of an AttributeError on
            # None; abort_recipe_processing() raises and does not return.
            self.abort_recipe_processing(
                'Could not find <div class="entrylist"> on %s' % self.url_pre)

        articles = []
        # href=True skips anchors without a target instead of raising KeyError.
        for link in container.findAll('a', id=True, href=True):
            url = link['href']
            # tag_to_string() also copes with titles wrapped in child tags;
            # fall back to the URL when the anchor has no text at all.
            heading = self.tag_to_string(link).strip() or url
            articles.append({'title': heading, 'url': url})

        # Same ordering the original built with insert(0, ...), but done with
        # a single reversal instead of shifting the list on every iteration.
        articles.reverse()
        return [(self.title, articles)]
#!/usr/bin/python
# encoding: utf-8
from calibre.web.feeds.recipes import BasicNewsRecipe
class Pro_Git_Chinese(BasicNewsRecipe):
    """Calibre recipe that bundles the "QT学习之路2" series into one e-book.

    The list of articles is scraped from the table-of-contents page at
    ``url_pre``.

    NOTE(review): the class name looks like a leftover from a different
    recipe; it is kept unchanged so anything referring to it keeps working.
    """

    title = 'QT学习之路2'
    description = ''
    __author__ = '豆子'
    cover_url = ''
    simultaneous_downloads = 5  # max download threads
    url_pre = 'https://www.devbean.net/2012/08/qt-study-road-2-catelog/'
    no_stylesheets = True  # strip the site's CSS
    remove_javascript = True
    # Keep only the element(s) carrying the ``thecontent clearfix`` class.
    keep_only_tags = [{ 'class': 'thecontent clearfix' }]

    def parse_index(self):
        """Return the article index for this recipe.

        Fetches the table-of-contents page and collects every link with an
        ``href`` inside the first ``<ol>`` on it, in page order.

        Returns:
            A one-element list ``[(self.title, articles)]`` where
            ``articles`` is a list of ``{'title': ..., 'url': ...}`` dicts.
        """
        soup = self.index_to_soup(self.url_pre)

        # The chapter list is the first ordered list on the page.
        toc = soup.find('ol')
        if toc is None:
            # Fail with a readable message instead of an AttributeError on
            # None; abort_recipe_processing() raises and does not return.
            self.abort_recipe_processing(
                'Could not find the <ol> table of contents on %s'
                % self.url_pre)

        articles = []
        # href=True skips anchors without a target instead of raising KeyError.
        for link in toc.findAll('a', href=True):
            url = link['href']
            # tag_to_string() also copes with titles wrapped in child tags;
            # fall back to the URL when the anchor has no text at all.
            heading = self.tag_to_string(link).strip() or url
            articles.append({'title': heading, 'url': url})

        return [(self.title, articles)]