浮云脱水小说站的搭建
- 4. Code
import urllib
import urlparse
from BeautifulSoup import BeautifulSoup

class BaiduZhibo(object):
    """可迭代的对象,不停返回下一页含有给定username的内容(默认为楼主)
    返回格式:
    字典:{
        "url": "..." #当前链接地址
        "page": 5 #当前页数
        "content": [...] #列表,里面有当前页每一个指定username的发言内容
    }
    参数:
    url: 帖子地址
- 5. Code
    obj_name: 需要抓取的用户昵称,默认为楼主
    limit: 限定抓取页面的数量,默认无限制
    html: 设定输出格式,True不做处理,False替换换行符、空格
    """
    def __init__(self, url, obj_name=None, limit=0, html=False):
        … …
    def next(self):
        if (self.limit and self.counter == self.limit) or (self.nowurl is None):
            print "finished."
            raise StopIteration
        … …
    def __iter__(self):
        return self
    … …
if __name__ == '__main__':
    # 初始化一个BaiduZhibo对象,然后拼装html页面
- 8. 表结构设计
CREATE TABLE "nowater_novel" (
    "id" serial NOT NULL PRIMARY KEY,
    "title" varchar(100) NOT NULL,
    "author" varchar(20) NOT NULL,
    "content_type" integer NULL, --小说类型
    "type" varchar(10) NOT NULL default 'baidu', --来源类型
    "url" varchar(200) NOT NULL unique, --来源url
    "update_interval" integer default 10, --更新间隔
    "email" varchar(40) NULL, --推荐人email,如果有的话
    "last_update_floor" integer NOT NULL default 0, --最后更新的楼层
    "last_update_url" varchar(500) NOT NULL, --最后更新的url
    "last_update_time" timestamp with time zone, --最后更新时间
    "ip" varchar(20) NOT NULL default '',
    "status" integer NOT NULL default 0,
    "jointime" timestamp with time zone NOT NULL DEFAULT CURRENT_TIMESTAMP
);
NOVEL_STATUS = {
    0: u"未初始化",
    1: u"更新中...",
    2: u"等待更新",
    3: u"已完结",
    4: u"更新异常",
    100: u"已删除"
}
- 13. 项目结构
[nowater_web] web.py 包含两个app
Novel: …
Admin: … 添加novel小说源信息
[database] … …
Nginx
使用twisted.web维持一个长连接
该长连接起到实时返回抓取进度的作用,同时writer.py也通过这个文件提供的http接口与之交互
[nowater] main.py basedb.py baidu_crawler.py writer.py long_looping.py tianya_crawler.py … …
提供抓取完成的html静态文件
调用http接口驱动爬虫
对于抓取进度方面的长连接请求反向代理到这里来
- 14. Comet实时显示抓取进度
class Progress(resource.Resource):
    def render_GET(self, request):
        id = request.args.get("id", [""])[0]
        client_id = request.args.get("client_id", [""])[0]
        client = mb.connection_made(id, client_id, request)
        if isinstance(client, str):
            return client
        else:
            request.notifyFinish().addErrback(self.disconnect, client)
            return server.NOT_DONE_YET

    def disconnect(self, ret, client):
        client.request = None
        client.status = False
- 15. Comet实时显示抓取进度
    def render_POST(self, request):
        """通知某小说更新到了某一页
        page:
            start 开始
            r_12 表示初始页面的12页
            o_12 表示小说页面的12页
            end 结束
        """
        id = request.args.get("id", [""])[0]
        page = request.args.get("page", [""])[0]
        mb.new_msg(id, page)
        return "ok"
- 16. 问题和解决
下载txt文件的实现
Nginx sendfile
location /download_txt/ {
    internal;
    alias /;
}
PY:
file = get_txt_file(str(id), web.utf8(novel.title))
if file:
    web.replace_header("Content-Type", "text/plaintext")
    web.replace_header("Content-Disposition", "attachment; filename=%s.txt" % id)
    web.header("X-Accel-Redirect", "/download_txt/%s" % file)
    return "ok"