全部内容已移至:
DownloadQisuuFile – 下载奇书网(qisuu.com)的电子书文件 v1.0
本来是去帮别人下载电子书的,需要手动一个个地点击,烦死了,而且广告一堆。
索性,自己写了个脚本,去下载对应的电子书。
后来又反复修补bug,支持更多类型的下载地址等等,目前内容如下:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
download txt ebook from:
Type1: 奇书网 → 女频言情 → 穿越架空 → 电子书列表
http://www.qisuu.com/soft/sort03/sort039/list39_2.html
such as:
http://www.qisuu.com/Shtml27341.html
->
http://dzs.qisuu.com/2012121606.rar
also rename to its title: 《所遇非淑》全集.rar
Type2: 奇书网 → 武侠仙侠 → 电子书列表
http://www.qisuu.com/soft/sort02/list2_1.html
such as:
http://www.qisuu.com/Shtml27681.html
->
http://dzs.qisuu.com/2013020206.rar
also rename to its title: 《洪荒之证道不朽》全集.rar
TODO:
totalPageNum -> should extract out
settings support:
typeStartUrl
startPageNum
downloadFolderName -> WuXiaXianXia, ChuanYueJiaKong
eg:
download_qisuu_ebook.py -s http://www.qisuu.com/soft/sort03/sort039/list39_1.html -n 12 -d ChuanYueJiaKong
download_qisuu_ebook.py -s http://www.qisuu.com/soft/sort02/list2_1.html -d WuXiaXianXia
Version: 2013-02-04
Author: Crifan Li
Contact: admin@crifan.org
-------------------------------------------------------------------------------
"""
#--------------------------------const values-----------------------------------
# Module-level state shared across the script.

# Starts empty.
# NOTE(review): main() stores the chosen download folder here under the
# 'downloadFolder' key rather than in gCfg -- confirm which one is intended.
gConst = {}

# Configuration values; 'downloadFolder' is declared here with no value yet.
gCfg = {'downloadFolder': None}

# Run-time values; 'mainPreUrl' is filled in by main() with the list-page URL
# prefix that precedes the page number.
gVal = {'mainPreUrl': None}
#---------------------------------import---------------------------------------
import os;
import re;
import sys;
sys.path.append("libs");
from BeautifulSoup import BeautifulSoup,Tag,CData;
import crifanLib;
import logging;
import argparse;
# import urllib;
# import json;
# import csv;
# import codecs;
def _extractMainPreUrl(startTypeUrl):
    """Return the list-page URL prefix that precedes the page number.

    For http://www.qisuu.com/soft/sort02/list2_1.html this yields
    http://www.qisuu.com/soft/sort02/list2_ .
    Returns None when startTypeUrl does not match the expected pattern.
    """
    foundMainPrefUrl = re.search(
        r"(?P<mainPreUrl>http://www\.qisuu\.com/[\w/]+/list\d+_)\d+\.html",
        startTypeUrl)
    logging.debug("foundMainPrefUrl=%s", foundMainPrefUrl)
    if foundMainPrefUrl is None:
        return None
    return foundMainPrefUrl.group("mainPreUrl")


def _extractTotalPageNum(listHtmlUni):
    """Return the total page count (int) parsed from the "last page" link.

    The link looks like:
    <a href="list39_72.html" title="尾页"><img ... /></a>
    Returns None when no such link is present in listHtmlUni.
    """
    # Backslashes are doubled (instead of using a raw string) so that the
    # unicode literal is valid on both Python 2 and Python 3.
    foundTotalPageNum = re.search(
        u'<a\\s+href="list\\d+_(?P<totalPageNum>\\d+)\\.html"\\s+title="尾页">',
        listHtmlUni)
    logging.debug("foundTotalPageNum=%s", foundTotalPageNum)
    if foundTotalPageNum is None:
        return None
    return int(foundTotalPageNum.group("totalPageNum"))


def _fixEbookAddress(ebookAddress):
    """Turn the address found in the page into an absolute download URL.

    Two kinds of address are recognised:
    - site-relative /soft/download.asp?... -> prefixed with the site root
    - absolute http://dzs.qisuu.com/...    -> returned unchanged
    Returns None for any other kind of address.
    """
    if re.match(r"/soft/download\.asp\?", ebookAddress):
        # e.g. http://www.qisuu.com/Shtml23411.html
        # -> http://www.qisuu.com/soft/download.asp?softid=23411&downid=0&id=67531
        downloadAddress = "http://www.qisuu.com" + ebookAddress
        logging.info("downloadAddress=%s", downloadAddress)
        logging.info("Found partial ebook address, so fix it to: %s", downloadAddress)
        return downloadAddress
    if re.match(r"http://dzs\.qisuu\.com/", ebookAddress):
        # e.g. http://dzs.qisuu.com/2013012903.rar
        return ebookAddress
    return None


def _downloadOneEbook(eachFileUrl, downloadFolder):
    """Fetch one ebook detail page and download its .rar into downloadFolder.

    The file is named after the page's <h1> title. Pages without a download
    address, with an unrecognised address, or without a usable title are
    logged and skipped instead of aborting the whole run.
    """
    fileRespHtml = crifanLib.getUrlRespHtml(eachFileUrl)
    fileSoup = BeautifulSoup(fileRespHtml, fromEncoding="GBK")

    # Both the <h1> tag and its .string may be missing; keep None so the
    # page can be skipped later instead of crashing on None + ".rar".
    h1Tag = fileSoup.h1
    h1 = h1Tag.string if h1Tag is not None else None
    logging.info("h1=%s", h1)

    # Sample of the markup being matched:
    # <A ... thunderResTitle='http://dzs.qisuu.com/2013012903.rar' ...>
    # <A ... thunderResTitle='/soft/download.asp?softid=23411&downid=0&id=67531' ...>
    foundEbookAddress = re.search(
        r"thunderResTitle='(?P<ebookAddress>[^']+)'", fileRespHtml)
    logging.debug("foundEbookAddress=%s", foundEbookAddress)
    if foundEbookAddress is None:
        logging.warning("Not found ebook address for url=%s", eachFileUrl)
        logging.debug("record its fileRespHtml=\n%s", fileRespHtml)
        # e.g. http://www.qisuu.com/Shtml23542.html
        logging.info(u"this url=%s may be: 此电子书已删除,暂不提供下载", eachFileUrl)
        return

    ebookAddress = foundEbookAddress.group("ebookAddress")
    logging.info("ebookAddress=%s", ebookAddress)

    fixedEbookAddress = _fixEbookAddress(ebookAddress)
    if fixedEbookAddress is None:
        logging.error("Can Not recognize this kind of ebook download address %s", ebookAddress)
        logging.debug("fileRespHtml=%s", fileRespHtml)
        return

    if h1 is None:
        logging.error("Can Not find title (h1) for url=%s, so skip it", eachFileUrl)
        return

    ebooName = h1 + ".rar"
    # Titles may contain characters such as '/', e.g.
    # http://www.qisuu.com/Shtml26634.html -> 《神魔手下好当差/穿越之傀儡娃娃》全集
    ebookFullName = os.path.join(
        downloadFolder, crifanLib.removeInvalidCharInFilename(ebooName, '_'))
    logging.info("dowloadinging ebookFullName=%s", ebookFullName)
    # NOTE(review): meaning of the third argument (True) is defined by
    # crifanLib.downloadFile -- confirm against that library.
    crifanLib.downloadFile(fixedEbookAddress, ebookFullName, True)


def main():
    """Download every ebook listed under one qisuu.com category.

    Command line options:
      -s/--startTypeUrl    URL of any list page of the category (required)
      -n/--startPageNum    first list page to process (default 1)
      -d/--downloadFolder  folder to store downloaded files (default "download")

    Stores the download folder in gConst['downloadFolder'] and the list-page
    URL prefix in gVal['mainPreUrl'].

    Raises:
      SystemExit(-1) when startTypeUrl is not a recognised list-page URL.
      SystemExit(-2) when the total page number cannot be found.
    """
    newParser = argparse.ArgumentParser(description="Download (ebook) file from qisuu")
    # required=True: without a start url there is nothing to do, and the
    # regex search below would otherwise fail with a TypeError on None.
    newParser.add_argument("-s", "--startTypeUrl", dest="startTypeUrl", required=True, help="start url of type. eg: http://www.qisuu.com/soft/sort03/sort039/list39_1.html, http://www.qisuu.com/soft/sort02/list2_1.html")
    newParser.add_argument("-n", "--startPageNum", dest="startPageNum", type=int, default=1, help="start page number")
    newParser.add_argument("-d", "--downloadFolder", dest="downloadFolder", default="download", help="foler name to store downloaded files")
    args = newParser.parse_args()

    # Read the parsed options explicitly instead of exec()-ing them into locals.
    startTypeUrl = args.startTypeUrl
    startPageNum = args.startPageNum
    downloadFolder = args.downloadFolder
    logging.info("startTypeUrl=%s", startTypeUrl)
    logging.info("startPageNum=%d", startPageNum)
    logging.info("downloadFolder=%s", downloadFolder)
    gConst['downloadFolder'] = downloadFolder

    mainPreUrl = _extractMainPreUrl(startTypeUrl)
    if mainPreUrl is None:
        logging.error("Can Not found main prefix url from %s", startTypeUrl)
        sys.exit(-1)
    logging.info("mainPreUrl=%s", mainPreUrl)
    gVal['mainPreUrl'] = mainPreUrl

    # init: create the download dir (recursively) when it does not exist yet
    if not os.path.isdir(downloadFolder):
        os.makedirs(downloadFolder)

    # extract total page number from the start page
    respHtml = crifanLib.getUrlRespHtml(startTypeUrl)
    respHtmlUni = respHtml.decode("GBK", 'ignore')
    totalPageNum = _extractTotalPageNum(respHtmlUni)
    if totalPageNum is None:
        logging.error("Can Not found total page number from %s resp html:\n%s", startTypeUrl, respHtml)
        sys.exit(-2)
    logging.info("totalPageNum=%s", totalPageNum)

    for pageNum in range(startPageNum, totalPageNum + 1):
        logging.info("============== page=%d ==============", pageNum)
        # e.g. http://www.qisuu.com/soft/sort03/sort039/list39_1.html
        eachPageUrl = mainPreUrl + str(pageNum) + ".html"
        logging.info("eachPageUrl=%s", eachPageUrl)
        pageRespHtml = crifanLib.getUrlRespHtml(eachPageUrl)

        # Each ebook entry on a list page looks like:
        # <span class="mainSoftName"><a href="/Shtml27341.html" title="...">...</a></span>
        pageSoup = BeautifulSoup(pageRespHtml, fromEncoding="GBK")
        foundAllMainList = pageSoup.findAll(name="span", attrs={"class": "mainSoftName"})
        logging.debug("foundAllMainList=%s", foundAllMainList)
        logging.info("mainListLen=%s", len(foundAllMainList))

        for urlIdx, eachMainList in enumerate(foundAllMainList):
            logging.info("-------------- page=%d, url=%d --------------", pageNum, urlIdx + 1)
            logging.debug("eachMainList=%s", eachMainList)
            href = eachMainList.a['href']
            logging.debug("href=%s", href)
            # e.g. http://www.qisuu.com/Shtml27667.html
            eachFileUrl = "http://www.qisuu.com" + href
            logging.info("eachFileUrl=%s", eachFileUrl)
            _downloadOneEbook(eachFileUrl, downloadFolder)
###############################################################################
if __name__ == "__main__":
    # Script entry point: log everything (DEBUG and above) to "<script name>.log",
    # echo INFO and above to the console, then run main().
    scriptSelfName = crifanLib.extractFilename(sys.argv[0])
    logging.basicConfig(
        level=logging.DEBUG,
        format='LINE %(lineno)-4d %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M',
        filename=scriptSelfName + ".log",
        filemode='w')
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s')
    # tell the handler to use this format
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
    try:
        main()
    except Exception:
        # Only real errors are reported here. SystemExit (raised by sys.exit()
        # in main() and by argparse) and KeyboardInterrupt do not derive from
        # Exception, so a deliberate exit is no longer logged as
        # "Unknown Error !" with a traceback.
        logging.exception("Unknown Error !")
        raise
相关的库,可参考:
- crifanLib
- BeautifulSoup
转载请注明:在路上 » 【记录】写了个Python脚本去从qisuu网站下载电子书