应朋友要求,希望把我crifan的所有电子书打包发给他供参考。
本来打算手动下载的,突然发现有100+本,手动下载效率较低。
所以考虑用Python脚本去实现。
输入文件用之前整理好的电子书列表文件 README.md:
去写代码
思路是:
把docbook和Gitbook的所有的book都提取出来
然后生成对应的pdf文件(也可设置为其他格式)的url链接
然后再用Python的requests去下载
【总结】
弄了一个晚上,最后优化后的代码是:
# Function: download all crifan ebook files
# Author: Crifan Li
# Update: 20201212
# Latest: https://github.com/crifan/crifan_ebook_readme/blob/master/downloadAllBooks.py
import os
import codecs
import re
import requests
import time
################################################################################
# Config
################################################################################
# Local folder into which every downloaded pdf file is saved.
OutputFolder = os.path.join("downloaded", "pdf")
# 'downloaded/pdf'
# Proxy settings handed to requests for both http and https traffic.
# NOTE(review): points to a proxy listening on localhost:58591 -- presumably
# specific to the author's machine; confirm/adjust before running elsewhere.
RequestsProxies = {
"http" : "http://127.0.0.1:58591",
"https" : "http://127.0.0.1:58591",
}
################################################################################
# Util Functions
################################################################################
def createFolder(folderFullPath):
    """Make sure the given folder exists.

    Missing parent folders are created as well; a folder which already
    exists is silently accepted.

    Note: relies on the ``exist_ok`` argument of ``os.makedirs`` -> Python 3.2+

    Args:
        folderFullPath (str): path of the folder to create
    Returns:
        None
    Raises:
        OSError: when the folder can not be created
    """
    os.makedirs(name=folderFullPath, exist_ok=True)
def loadTextFromFile(fullFilename, fileEncoding="utf-8"):
    """Read a whole text file and return its decoded content.

    Args:
        fullFilename (str): path of the file to read
        fileEncoding (str): encoding used to decode the file bytes
    Returns:
        str: the complete text of the file
    Raises:
        OSError: when the file can not be opened
        UnicodeDecodeError: when the content does not match fileEncoding
    """
    # codecs.open reads the raw bytes and only decodes them, so line endings
    # are returned exactly as stored in the file
    with codecs.open(fullFilename, mode='r', encoding=fileEncoding) as textFp:
        return textFp.read()
def isFileExistAndValid(filePath, fullFileSize=None):
    """Tell whether a file exists and looks complete.

    Args:
        filePath (str): path of the file to check
        fullFileSize (int): expected full size in bytes; when omitted (or 0)
            any non-empty file is accepted
    Returns:
        bool: True only when filePath is an existing regular file AND its size
            equals fullFileSize (when given) / is larger than 0 (otherwise)
    Raises:
    Examples:
    """
    if not os.path.isfile(filePath):
        return False

    localFileSize = os.path.getsize(filePath) # 260900226
    if fullFileSize:
        # expected size is known -> must match exactly
        return localFileSize == fullFileSize

    # expected size unknown -> only reject empty file
    return localFileSize > 0
def floatSecondsToDatetimeDict(floatSeconds):
    """Split a time delta given in (float) seconds into its calendar-like parts.

    Args:
        floatSeconds (float): time delta in seconds
    Returns:
        dict with the int values of: days, hours, minutes, seconds, millseconds, microseconds
        Note: the key is really spelled 'millseconds', other code looks it up under that name
    Examples:
        96400.3765293 -> {'days': 1, 'hours': 2, 'minutes': 46, 'seconds': 40, 'millseconds': 376, 'microseconds': 529}
    """
    wholeSeconds = int(floatSeconds)

    # fraction of the second -> whole milliseconds + whole microseconds
    fractionAsMilliSec = (floatSeconds - wholeSeconds) * 1000
    wholeMilliSec = int(fractionAsMilliSec)
    wholeMicroSec = int((fractionAsMilliSec - wholeMilliSec) * 1000)

    # whole seconds -> days / hours / minutes / seconds
    dayNum, remainSeconds = divmod(wholeSeconds, 24 * 60 * 60)
    hourNum, remainSeconds = divmod(remainSeconds, 60 * 60)
    minuteNum, secondNum = divmod(remainSeconds, 60)

    return {
        "days": dayNum,
        "hours": hourNum,
        "minutes": minuteNum,
        "seconds": secondNum,
        "millseconds": wholeMilliSec,
        "microseconds": wholeMicroSec,
    }
def datetimeDictToStr(datetimeDict,
    seperatorD=" ",
    seperatorHms=":",
    seperatorMilliS=".",
    isShowZeroDayStr=False,
    isShowMilliSecPart=True,
):
    """Format a date time dict as '[days ]HH:MM:SS[.mmm]'

    Args:
        datetimeDict (dict): must contain 'hours', 'minutes', 'seconds'; may contain 'days' and 'millseconds'
        seperatorD (str): text put between days and the hour part
        seperatorHms (str): text put between hours, minutes and seconds
        seperatorMilliS (str): text put between seconds and milli seconds
        isShowZeroDayStr (bool): whether the days part is shown when days=0
        isShowMilliSecPart (bool): whether the milli seconds part is shown
    Returns:
        str
    Raises:
        KeyError: when 'hours', 'minutes' or 'seconds' is missing
    Examples:
        {'days': 0, 'hours': 0, 'microseconds': 986, 'millseconds': 804, 'minutes': 3, 'seconds': 38}
            -> '00:03:38.804'
            -> '0 00:03:38.804' when isShowZeroDayStr=True
        {'hours': 0, 'minutes': 3, 'seconds': 38}
            -> '00:03:38'
    """
    strPartList = []

    # optional days part
    if "days" in datetimeDict:
        dayNum = datetimeDict["days"]
        if isShowZeroDayStr or (dayNum != 0):
            strPartList.append("%d%s" % (dayNum, seperatorD))

    # mandatory hour:minute:second part
    hmsValueTuple = (
        datetimeDict["hours"], seperatorHms,
        datetimeDict["minutes"], seperatorHms,
        datetimeDict["seconds"],
    )
    strPartList.append("%02d%s%02d%s%02d" % hmsValueTuple) # '00:03:12'

    # optional milli seconds part
    if ("millseconds" in datetimeDict) and isShowMilliSecPart:
        strPartList.append("%s%03d" % (seperatorMilliS, datetimeDict["millseconds"]))

    return "".join(strPartList)
def formatSize(sizeInBytes, decimalNum=1, isUnitWithI=False, sizeUnitSeperator=""):
    """Convert a byte count into a human readable string, using 1024 as step.

    Args:
        sizeInBytes (int/float): size in bytes
        decimalNum (int): number of digits after the decimal point
        isUnitWithI (bool): use IEC style units (KiB, MiB, ...) instead of KB, MB, ...
        sizeUnitSeperator (str): text put between the number and the unit
    Returns:
        str
    Examples:
        3746 -> '3.7KB'
        87533 -> '85.5KiB' (isUnitWithI=True)
        98654 -> '96.3 KB' (sizeUnitSeperator=" ")
        352 -> '352.0B'
        76383285 -> '72.84MB' (decimalNum=2)
        763832854988542 -> '694.70TB' (decimalNum=2)
    refer:
        https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
    """
    # https://en.wikipedia.org/wiki/Binary_prefix#Specific_units_of_IEC_60027-2_A.2_and_ISO.2FIEC_80000
    # K=kilo, M=mega, G=giga, T=tera, P=peta, E=exa, Z=zetta, Y=yotta
    iecMark = 'i' if isUnitWithI else ''
    # plain bytes never get the 'i' -> ['', 'K', 'M', ...] or ['', 'Ki', 'Mi', ...]
    unitPrefixList = [''] + [eachPrefix + iecMark for eachPrefix in 'KMGTPEZ']
    # used for everything which is too large even for the last prefix in the list
    biggestPrefix = 'Y' + iecMark

    byteSuffix = "B"
    outputFormat = "%." + str(decimalNum) + "f" + sizeUnitSeperator + "%s%s" # "%.1f%s%s"

    remainSize = sizeInBytes
    for curPrefix in unitPrefixList:
        if abs(remainSize) < 1024.0:
            return outputFormat % (remainSize, curPrefix, byteSuffix)
        remainSize /= 1024.0

    return outputFormat % (remainSize, biggestPrefix, byteSuffix)
def getFileSizeFromUrl(fileUrl, proxies=None):
    """Get file size from file url

    Only the response headers are fetched (stream mode), the file content
    itself is not downloaded.

    Args:
        fileUrl (str): file url
        proxies (dict): requests proxies
    Returns:
        file size in bytes (int), or None when failed to get it: request
        failed, http error status (eg: 404), or no valid Content-Length header
    Raises:
    Examples:
        input: https://gameapktxdl.vivo.com.cn/appstore/developer/soft/20201020/202010201805243ed5v.apk
        output: 154551625
    """
    totalFileSize = None

    try:
        # with: always release the connection, the body of a streamed response is never read here
        with requests.get(fileUrl, stream=True, proxies=proxies) as resp:
            # Content-Length of an error page (404, 500, ...) is NOT the size of the wanted file
            resp.raise_for_status()
            respHeaders = resp.headers
            # {'Server': 'Tengine', 'Date': 'Fri, 11 Dec 2020 14:11:00 GMT', 'Content-Type': 'application/pdf', 'Content-Length': '24422168', 'Last-Modified': 'Fri, 18 Sep 2020 09:56:15 GMT', 'Connection': 'keep-alive', 'ETag': '"5f64843f-174a718"', 'Strict-Transport-Security': 'max-age=15768000', 'Accept-Ranges': 'bytes'}
            contentLengthStr = respHeaders['Content-Length'] # '154551625', '24422168'
            totalFileSize = int(contentLengthStr) # 154551625, 24422168
    except (requests.exceptions.RequestException, KeyError, ValueError):
        # RequestException: network/proxy problem or http error status
        # KeyError: no Content-Length header (eg: chunked response)
        # ValueError: Content-Length is not a number
        # Note: not bare except, so Ctrl+C still works
        totalFileSize = None

    return totalFileSize # 154551625
def streamingDownloadFile(
    url,
    fileToSave=None,
    proxies=None,
    isShowSpeed=True,
    chunkSize=1024*512,
    resumeSize=0,
    totalSize=0,
):
    """Download file using stream mode, support showing process with current speed, percent, size

    Args:
        url (str): file online url
        fileToSave (str): filename or full file path; when omitted, last part of url is used
        proxies (dict): requests proxies
        isShowSpeed (bool): show downloading speed or not
        chunkSize (int): size in bytes of each chunk read from the stream
        resumeSize (int): the size to start download, normally is the local already downloaded size
            0 means download whole file and overwrite any existing local file
        totalSize (int): total file size, used for calculate downloaded percent and check download is complete
            0 means unknown
    Returns:
        download ok or not (bool)
            False: http error status, or downloaded less than (known) totalSize
    Raises:
        requests.exceptions.RequestException: network/proxy problem while downloading
        OSError: fileToSave can not be opened/written
    Examples:
    """
    isDownloadOk = False

    if not fileToSave:
        # same fallback as downloadFile
        fileToSave = url.split("/")[-1]

    if isShowSpeed and (totalSize == 0):
        gotTotalSize = getFileSizeFromUrl(url, proxies) # 154551625
        if gotTotalSize:
            totalSize = gotTotalSize

    headers = {
        'Range': 'bytes=%d-' % resumeSize,
        # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
    }

    curDownloadedSize = 0
    # with: always release the connection, even when stopped in the middle
    with requests.get(url, proxies=proxies, headers=headers, stream=True) as resp:
        if not resp.ok:
            # eg: 404 not found, 416 range not satisfiable -> do not save the error page as file content
            print("Failed to download %s, http status code=%s" % (url, resp.status_code))
            return isDownloadOk

        if (resumeSize > 0) and (resp.status_code != 206):
            # not 206 Partial Content: server ignored Range header and is sending the whole file
            # -> appending it to the existing part would corrupt the file -> restart from beginning
            resumeSize = 0

        # only append when really resuming, otherwise overwrite stale local content
        fileOpenMode = "ab" if resumeSize > 0 else "wb"

        with open(fileToSave, fileOpenMode) as f:
            startTime = time.time()
            prevTime = startTime
            for curChunkBytes in resp.iter_content(chunk_size=chunkSize):
                if not curChunkBytes:
                    continue

                curTime = time.time() # 1606456020.0718982
                f.write(curChunkBytes)
                # flush each chunk: local file size stays usable as resume position when interrupted
                f.flush()

                curChunkSize = len(curChunkBytes) # 524288
                curDownloadedSize += curChunkSize # 524288

                if isShowSpeed:
                    totalDownloadedSize = curDownloadedSize + resumeSize # 12058624
                    totalDownloadedSizeStr = formatSize(totalDownloadedSize) # '11.5MB'

                    # time delta can be 0 (coarse clock, chunk already buffered) -> avoid ZeroDivisionError
                    curDownloadTime = curTime - prevTime # 15.63818907737732
                    curSpeed = (curChunkSize / curDownloadTime) if curDownloadTime > 0 else 0 # 670522.651191692
                    curSpeedStr = formatSize(curSpeed) # '231.3KB'

                    totalDownloadTime = curTime - startTime # 15.63818907737732
                    averageSpeed = (curDownloadedSize / totalDownloadTime) if totalDownloadTime > 0 else 0 # 670522.651191692
                    averageSpeedStr = formatSize(averageSpeed) # '231.3KB'

                    totalDownloadTimeDict = floatSecondsToDatetimeDict(totalDownloadTime)
                    totalDownloadTimeStr = datetimeDictToStr(totalDownloadTimeDict, isShowMilliSecPart=False)

                    showStr = "downloading speed: cur=%s/s, avg=%s/s, time: total=%s, size: %s" % (curSpeedStr, averageSpeedStr, totalDownloadTimeStr, totalDownloadedSizeStr)

                    if totalSize > 0:
                        downloadedPercent100 = round(100 * totalDownloadedSize / totalSize, 2) # 47.23
                        downloadedPercent100Str = str(downloadedPercent100) # '47.23'
                        percentStr = ", percent: %s%%" % downloadedPercent100Str # ', percent: 47.23%'
                    else:
                        percentStr = ""

                    showStr += percentStr
                    # 'downloading speed: cur=231.3KB/s, avg=231.3KB/s, time: total=00:00:02, size: 11.5MB, percent: 49.38%'
                    print(showStr)

                prevTime = curTime

    # stream ended without exception
    if totalSize > 0:
        # total size known: only ok when got (at least) all bytes
        isDownloadOk = (resumeSize + curDownloadedSize) >= totalSize
    else:
        isDownloadOk = True

    return isDownloadOk
def downloadFile(url,
    fileToSave=None,
    proxies=None,
    isStreamMode=True,
    isResume=True,
):
    """Download file from url then save to file

    Args:
        url (str): file online url
        fileToSave (str): filename or full file path; when omitted, last part of url is used
        proxies (dict): requests proxies
        isStreamMode (bool): use stream mode or not
        isResume (bool): only for stream mode: continue a partly downloaded local file (True)
            or discard it and download whole file again (False)
    Returns:
        download ok or not (bool)
    Raises:
    Examples:
        input:
            'https://book.crifan.org/books/5g_message_rcs_tech_summary/pdf/5g_message_rcs_tech_summary.pdf'
            'downloaded/pdf/5g_message_rcs_tech_summary.pdf'
        output:
            True
    """
    isDownloadOk = False

    if not fileToSave:
        urlPartList = url.split("/")
        fileToSave = urlPartList[-1]

    try:
        if isStreamMode:
            totalFileSize = getFileSizeFromUrl(url, proxies) # 154551625
            if not totalFileSize:
                print("Failed to get total file size from %s" % url)
                return isDownloadOk

            totalSizeStr = formatSize(totalFileSize)
            print("Get total file size %s from %s" % (totalSizeStr, url))

            isDownloadedAndValid = isFileExistAndValid(fileToSave, fullFileSize=totalFileSize)
            if isDownloadedAndValid:
                print("%s is already download" % fileToSave)
                isDownloadOk = True
                return isDownloadOk

            curDownloadedSize = 0
            isExistFile = os.path.isfile(fileToSave)
            if isExistFile:
                curDownloadedSize = os.path.getsize(fileToSave)
                curDownloadedSizeStr = formatSize(curDownloadedSize)
                print("Already downloaded %s for %s" % (curDownloadedSizeStr, fileToSave))

                if curDownloadedSize > totalFileSize:
                    # possible is local is new version, so consider as downloaded
                    print("Downloaded=%s > online=%s, consider as downloaded" % (curDownloadedSizeStr, totalSizeStr))
                    isDownloadOk = True
                    return isDownloadOk

                if not isResume:
                    # restart from zero: remove old partial file, otherwise new content would be appended to it
                    os.remove(fileToSave)
                    curDownloadedSize = 0

            isStreamOk = streamingDownloadFile(
                url,
                fileToSave=fileToSave,
                proxies=proxies,
                isShowSpeed=True,
                resumeSize=curDownloadedSize,
                totalSize=totalFileSize,
            )
            # also check the saved file itself: full size on disk means download is complete
            isDownloadOk = bool(isStreamOk) or isFileExistAndValid(fileToSave, fullFileSize=totalFileSize)
        else:
            resp = requests.get(url, proxies=proxies)
            # http error (eg: 404) -> exception -> do not save error page as file content
            resp.raise_for_status()
            with open(fileToSave, 'wb') as saveFp:
                saveFp.write(resp.content)
            isDownloadOk = True
    except Exception as curException:
        # Exception instead of BaseException: Ctrl+C (KeyboardInterrupt) and SystemExit must still stop the script
        print("Exception %s when download %s to %s" % (curException, url, fileToSave))

    return isDownloadOk
################################################################################
# Const & Config & Settings
################################################################################
InputMdFile = "README.md"
################################################################################
# Main
################################################################################
mdStr = loadTextFromFile(InputMdFile)
# print("mdStr=%s" % mdStr)

# downloaded files are written into OutputFolder, so it has to exist before first download
createFolder(OutputFolder)

allPdfUrlList = []

# docbook links inside the markdown text look like:
# * [Notepad++](https://www.crifan.org/files/doc/docbook/rec_soft_npp/release/html/rec_soft_npp.html)
# Note:
# - raw string: \. and \w are regex escapes, not (invalid) string escapes
# - (?:com|org): sample links and generated pdf urls use crifan.org, older links use crifan.com -> accept both
allDocbookNameList = re.findall(r"https?://www\.crifan\.(?:com|org)/files/doc/docbook/(\w+)/release/", mdStr)
# ['char_encoding_usage', 'crifanlib_python', ...]
docbookTotalNum = len(allDocbookNameList)
print("docbookTotalNum=%s" % docbookTotalNum) # 113
# same book is linked multiple times (html, pdf, ...) -> remove duplicates
uniqueDocbookNameSet = set(allDocbookNameList)
uniqueDocbookNameList = list(uniqueDocbookNameSet)
uniqueDocbookTotalNum = len(uniqueDocbookNameList)
print("uniqueDocbookTotalNum=%s" % uniqueDocbookTotalNum) # 56
for eachDocbookName in uniqueDocbookNameList:
    curDocbookPdfUrl = "https://www.crifan.org/files/doc/docbook/%s/release/pdf/%s.pdf" % (eachDocbookName, eachDocbookName)
    # https://www.crifan.org/files/doc/docbook/rec_soft_npp/release/pdf/rec_soft_npp.pdf
    # https://www.crifan.org/files/doc/docbook/soft_dev_basic/release/pdf/soft_dev_basic.pdf
    allPdfUrlList.append(curDocbookPdfUrl)

# gitbook links inside the markdown text look like:
# * [VSCode](http://book.crifan.org/books/best_editor_vscode/website)
allGitbookNameList = re.findall(r"https?://book\.crifan\.(?:com|org)/books/(\w+)/website", mdStr)
gitbooNameTotalNum = len(allGitbookNameList)
print("gitbooNameTotalNum=%s" % gitbooNameTotalNum) # 139
uniqueGitbookNameSet = set(allGitbookNameList)
uniqueGitbookNameList = list(uniqueGitbookNameSet)
uniqueGitbookNameTotalNum = len(uniqueGitbookNameList)
print("uniqueGitbookNameTotalNum=%s" % uniqueGitbookNameTotalNum) # 71
for eachGitbookName in uniqueGitbookNameList:
    curGitbookPdfUrl = "https://book.crifan.org/books/%s/pdf/%s.pdf" % (eachGitbookName, eachGitbookName)
    # https://book.crifan.org/books/best_editor_vscode/pdf/best_editor_vscode.pdf
    # https://book.crifan.org/books/scientific_network_summary/pdf/scientific_network_summary.pdf
    allPdfUrlList.append(curGitbookPdfUrl)

# sets have no fixed order -> sort to get the same download order on every run
allPdfUrlList.sort()
pdfUrlTotalNum = len(allPdfUrlList)
print("pdfUrlTotalNum=%s" % pdfUrlTotalNum) # 127

# download all pdf url
for curUrlIdx, eachPdfUrl in enumerate(allPdfUrlList):
    curUrlNum = curUrlIdx + 1
    # 'https://book.crifan.org/books/5g_message_rcs_tech_summary/pdf/5g_message_rcs_tech_summary.pdf'
    pdfFilename = eachPdfUrl.split("/")[-1]
    # '5g_message_rcs_tech_summary.pdf'
    print("%s [%2d/%2d] %s %s" % ("-"*30, curUrlNum, pdfUrlTotalNum, pdfFilename ,"-"*30))
    print("%s" % eachPdfUrl)
    downloadedFilePath = os.path.join(OutputFolder, pdfFilename)
    isDownloadOk = downloadFile(eachPdfUrl, downloadedFilePath, proxies=RequestsProxies)
    print("%s to downloaded %s from %s" % (isDownloadOk, pdfFilename, eachPdfUrl))

# Related log output:
已下载的:
------------------------------ [125/127] vmware_tutorial.pdf ------------------------------
https://www.crifan.org/files/doc/docbook/vmware_tutorial/release/pdf/vmware_tutorial.pdf
Get total file size 248.2KB from https://www.crifan.org/files/doc/docbook/vmware_tutorial/release/pdf/vmware_tutorial.pdf
downloaded/pdf/vmware_tutorial.pdf is already download
True to downloaded vmware_tutorial.pdf from https://www.crifan.org/files/doc/docbook/vmware_tutorial/release/pdf/vmware_tutorial.pdf
本身不存在book的pdf,所以下载失败的:
------------------------------ [66/127] test_automation_overview.pdf ------------------------------
https://book.crifan.org/books/test_automation_overview/pdf/test_automation_overview.pdf
Failed to get total file size from https://book.crifan.org/books/test_automation_overview/pdf/test_automation_overview.pdf
False to downloaded test_automation_overview.pdf from https://book.crifan.org/books/test_automation_overview/pdf/test_automation_overview.pdf
已下载部分,断点续传,继续下载的:
------------------------------ [ 1/127] 5g_message_rcs_tech_summary.pdf ------------------------------
https://book.crifan.org/books/5g_message_rcs_tech_summary/pdf/5g_message_rcs_tech_summary.pdf
Get total file size 23.3MB from https://book.crifan.org/books/5g_message_rcs_tech_summary/pdf/5g_message_rcs_tech_summary.pdf
Already downloaded 11.5MB for downloaded/pdf/5g_message_rcs_tech_summary.pdf
downloading speed: cur=554.6KB/s, avg=554.6KB/s, time: total=00:00:00, size: 12.0MB, percent: 51.52%
downloading speed: cur=847.2KB/s, avg=670.4KB/s, time: total=00:00:01, size: 12.5MB, percent: 53.67%
...
downloading speed: cur=146.5KB/s, avg=351.5KB/s, time: total=00:00:33, size: 23.0MB, percent: 98.75%
downloading speed: cur=118.6KB/s, avg=335.3KB/s, time: total=00:00:36, size: 23.3MB, percent: 100.0%
False to downloaded 5g_message_rcs_tech_summary.pdf from https://book.crifan.org/books/5g_message_rcs_tech_summary/pdf/5g_message_rcs_tech_summary.pdf
下载的pdf文件:

最后:
完整代码详见:https://github.com/crifan/crifan_ebook_readme/blob/master/downloadAllBooks.py
供参考。
转载请注明:在路上 » 【已解决】写脚本下载crifan的所有电子书