最新消息:20210917 已从crifan.com换到crifan.org

【已解决】PySpider中用Python代码爬取小花生app中绘本数据

app crifan 1094浏览 0评论
折腾:
【记录】爬取小花生app中自主阅读馆和亲子阅读馆中的有音频的绘本数据
期间,已经大概分析了绘本数据的相关api。
现在需要去写代码模拟下载数据。
先确保第一个获取绘本列表的api能够正常获取数据。
期间:
【已解决】mac中PySpider运行出错:Deprecated option domaincontroller use http_authenticator.domain_controller instead
继续写代码。
结果:
【已解决】PySpider模拟请求小花生api接口出错:requests.exceptions.HTTPError HTTP 500 Internal Server Error
接着就可以写代码去爬取数据了。
接着又遇到:
【已解决】PySpider模拟小花生app请求parentChildReadingBookQuery2返回空数据
以及:
【已解决】小花生app中调用接口parentChildReadingBookQuery2时timestamp和signature生成的逻辑
然后接着去模拟剩下的selfReadingBookQuery2
【已解决】PySpider无法继续爬取剩余绘本数据
然后继续参考:
【已解决】用Charles+Postman+Python解密脚本分析小花生app中绘本接口和返回信息
实现剩余的api请求。
然后基本上写好了代码,且也优化好了:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-03-27 15:35:20
# Project: XiaohuashengApp


from pyspider.libs.base_handler import *


import os
import json
import codecs
import base64
import gzip
import copy
import time
import re
# import datetime
from datetime import datetime, timedelta
from hashlib import md5


######################################################################
# Const
######################################################################


# Scheme, host and port of the Xiaohuasheng app API server
gServerPort = "http://www.xiaohuasheng.cn:83"
# Root that relative resource paths (cover images, audio files) are joined onto
gResourcesRoot = "https://img.xiaohuasheng.cn"


# Book list query endpoints, derived from the API server root
SelfReadingUrl = gServerPort + "/Reading.svc/selfReadingBookQuery2"
ParentChildReadingUrl = gServerPort + "/Reading.svc/parentChildReadingBookQuery2"


# Value of the response "M" field that is treated as success
RESPONSE_OK = "1001"


######################################################################
# Config & Settings
######################################################################


# Root folder that every downloaded json/image/audio file is saved under
OutputFolder = "/Users/crifan/dev/dev_root/company/xxx/projects/crawler_projects/crawler_xiaohuasheng_app/output"


# Page size ("limit") used by all paged list queries
DefaultPageSize = 10


gUserAgentNoxAndroid = "Mozilla/5.0 (Linux; U; Android 4.4.2; zh-cn; A0001 Build/KOT49H) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"


# NOTE(review): account credentials are hard-coded in source; consider loading
# them from the environment or a private config file before sharing this script.
gUserId = "1134723"
gAuthorization = """NSTp9~)NwSfrXp@\\"""


# Both values are appended to the string that generateSignature hashes.
gUserToken = "40d2267f-359e-4526-951a-66519e5868c3"
# The opening quote used to be a typographic quote, which is a syntax error.
gSecretKey = "AyGt7ohMR!xx#N"


# Headers shared by every API request. The per-request "timestamp" and
# "signature" headers are not listed here; generateCurrentHeaders adds them
# to a copy of this dict.
gHeaders = {
    "Host": "www.xiaohuasheng.cn:83",
    "User-Agent": gUserAgentNoxAndroid,
    "Content-Type": "application/json",

    "userId": gUserId,
    "Authorization": gAuthorization,

    "cookie": "ASP.NET_SessionId=dxf3obxgn5t4w350xp3icgy0",
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "cache-control": "no-cache",

    "Connection": "keep-alive",
}


# "level" url parameter; -1 is the pk of the "全部" (all) level entry
gParamLevelAll = -1
# Fixed url parameter of queryEnglishSeriesAudio and viewAudioPackage
# (meaning not known from this file)
gFixParam1 = 1


# Location sent to getServerBookInfo17
gLongitude = "120.136174"
gLatitude = "28.997280"
# Fixed trailing url parameter of getServerBookInfo17 (meaning not known from this file)
gFixParam2 = 10


######################################################################
# Common Util Functions
######################################################################


def getCurTimestamp(withMilliseconds=False):
    """
    Return the timestamp of the current time (datetime.now()).
        default -> seconds, 10 digits: 1351670162
        withMilliseconds=True -> milliseconds, 13 digits: 1531464292921
    """
    return datetimeToTimestamp(datetime.now(), withMilliseconds)


def datetimeToTimestamp(datetimeVal, withMilliseconds=False):
    """
    Convert a datetime value to an integer timestamp.

    The seconds part comes from time.mktime() on the value's time tuple, so the
    value is interpreted as local time.
    eg:
        "2006-06-01 00:00:00.123" -> 1149091200
        with milliseconds -> 1149091200123

    :param datetimeVal: datetime value to convert
    :param withMilliseconds: False -> seconds (10 digits), True -> milliseconds (13 digits)
    :return: int timestamp
    """
    wholeSeconds = int(time.mktime(datetimeVal.timetuple()))  # 1531468736
    if not withMilliseconds:
        return wholeSeconds

    # Integer arithmetic only: adding the microseconds as a float and then
    # truncating could lose one millisecond for some timestamp magnitudes.
    wholeMilliseconds = datetimeVal.microsecond // 1000  # 817762 -> 817
    return wholeSeconds * 1000 + wholeMilliseconds  # 1531468736817


def extractSuffix(fileNameOrUrl):
    """
    Return the text after the last '.' of a file name or url.
    eg:
        https://cdn2.xxx.cn/2018-09-10/15365514898246.mp4 -> mp4
        15365514894833.srt -> srt
    A value without any '.' is returned unchanged.
    """
    _head, _dot, suffix = fileNameOrUrl.rpartition('.')
    return suffix


def createFolder(folderFullPath):
    """
    Make sure the folder exists, creating missing parent folders as needed.
    An already existing folder is not an error.
    Note: needs Python 3.2+ (exist_ok)
    """
    os.makedirs(folderFullPath, exist_ok=True)
    createdMessage = "Created folder: %s" % folderFullPath
    print(createdMessage)


def saveDataToFile(fullFilename, binaryData):
    """
    Save binary data into a file, overwriting any existing file.

    :param fullFilename: path of the file to write
    :param binaryData: bytes to write
    """
    # the with block closes the file; no explicit close() needed
    with open(fullFilename, 'wb') as fp:
        fp.write(binaryData)
    print("Complete save file %s" % fullFilename)


def saveJsonToFile(fullFilename, jsonValue):
    """
    Write a json-serializable value to a UTF-8 file,
    indented by 2 spaces and with non-ASCII characters kept as-is.
    """
    dumpOptions = {"indent": 2, "ensure_ascii": False}
    with codecs.open(fullFilename, 'w', encoding="utf-8") as jsonFp:
        json.dump(jsonValue, jsonFp, **dumpOptions)
    print("Complete save json %s" % fullFilename)


def loadJsonFromFile(fullFilename):
    """Read a UTF-8 json file and return the parsed value."""
    with codecs.open(fullFilename, 'r', encoding="utf-8") as jsonFp:
        parsedJson = json.load(jsonFp)
    print("Complete load json from %s" % fullFilename)
    return parsedJson


######################################################################
# Project Specific Functions
######################################################################


def getSeriesFolder(seriesId):
    """Absolute folder of one series: <OutputFolder>/series/<seriesId>"""
    seriesFolder = os.path.join(OutputFolder, "series", str(seriesId))
    return os.path.abspath(seriesFolder)


def getSeriesAudioPackagesFolder(seriesId):
    """Absolute folder holding all audio packages of a series: <series folder>/AudioPackages"""
    audioPackagesFolder = os.path.join(getSeriesFolder(seriesId), "AudioPackages")
    return os.path.abspath(audioPackagesFolder)


def getSeriesBooksFolder(seriesId):
    """Absolute folder holding all books of a series: <series folder>/Books"""
    booksFolder = os.path.join(getSeriesFolder(seriesId), "Books")
    return os.path.abspath(booksFolder)


def getSingleAudioPackageFolder(seriesId, audioPackageId):
    """Absolute folder of one audio package: <series folder>/AudioPackages/<audioPackageId>"""
    parentFolder = getSeriesAudioPackagesFolder(seriesId)
    return os.path.abspath(os.path.join(parentFolder, str(audioPackageId)))


def getSingleAudioFolder(seriesId, audioPackageId, audioId):
    """Absolute folder of one audio: <audio package folder>/<audioId>"""
    parentFolder = getSingleAudioPackageFolder(seriesId, audioPackageId)
    return os.path.abspath(os.path.join(parentFolder, str(audioId)))


def getSingleBookFolder(seriesId, bookId):
    """Absolute folder of one book: <series folder>/Books/<bookId>"""
    parentFolder = getSeriesBooksFolder(seriesId)
    return os.path.abspath(os.path.join(parentFolder, str(bookId)))


######################################################################
# Main
######################################################################


class Handler(BaseHandler):
    crawl_config = {
    }


    #----------------------------------------
    # Util Functions
    #----------------------------------------


    def downloadFileCallback(self, response):
        fileInfo = response.save
        print("fileInfo=%s" % fileInfo)


        binData = response.content
        fileFullPath = os.path.join(fileInfo["saveFolder"], fileInfo["filename"])
        print("fileFullPath=%s" % fileFullPath)
        saveDataToFile(fileFullPath, binData)


    def downloadFile(self, fileInfo):
        urlToDownload = fileInfo["fileUrl"]
        print("urlToDownload=%s" % urlToDownload)
        self.crawl(urlToDownload,
            callback=self.downloadFileCallback,
            save=fileInfo)


    def generateSignature(self, timestampInt, jValueOrUrlEndpoint):
        # print("generateSignature: timestampInt=%d, jValueOrUrlEndpoint=%s" % (timestampInt, jValueOrUrlEndpoint))
        # userId = "1134723"
        userId = gUserId
        timestamp = "%s" % timestampInt
        # localObject = "/Reading.svc/parentChildReadingBookQuery2"
        # localObject = jValueOrUrlEndpoint
        # userToken = "40d2267f-359e-4526-951a-66519e5868c3"
        userToken = gUserToken
        # fixedSault = “AyGt7ohMR!xx#N"
        # secretKey = “AyGt7ohMR!xx#N"
        secretKey = gSecretKey


        # strToCalc = userId + timestamp + localObject + jValueOrUrlEndpoint + fixedSault
        # strToCalc = timestamp + localObject + fixedSault
        strToCalc = userId + timestamp + jValueOrUrlEndpoint + userToken + secretKey
        # print("strToCalc=%s" % strToCalc)


        encodedStr = strToCalc.encode()
        # encodedStr = strToCalc.encode("UTF-8")
        # print("encodedStr=%s" % encodedStr)
        md5Result = md5(encodedStr)
        # print("md5Result=%s" % md5Result) # md5Result=<md5 HASH object @ 0x1044f1df0>


        # md5Result = md5()
        # md5Result.update(strToCalc)


        # md5Digest = md5Result.digest()
        # print("md5Digest=%s" % md5Digest) #
        # print("len(md5Digest)=%s" % len(md5Digest))


        md5Hexdigest = md5Result.hexdigest()
        # print("md5Hexdigest=%s" % md5Hexdigest)
        # print("len(md5Hexdigest)=%s" % len(md5Hexdigest))
        # md5Hexdigest=c687d5dfa015246e6bdc6b3c27c2afea
        # print("md5=%s from %s" % (md5Hexdigest, strToCalc))
        return md5Hexdigest
        # return md5Digest


    def extractResponseData(self, respJson):
        """
        {
            "C": 2,
            "J": "H4sIAA.......AA=",
            "M": "1001",
            "ST": null
        }
        """
        # respJson = json.loads(respJson)
        respM = respJson["M"]
        if respM != RESPONSE_OK:
            return None
        encodedStr = respJson["J"]
        decodedStr = base64.b64decode(encodedStr)
        # print("decodedStr=%s" % decodedStr)


        decompressedStr = gzip.decompress(decodedStr)
        # print("decompressedStr=%s" % decompressedStr)
        decompressedStrUnicode = decompressedStr.decode("UTF-8")
        # print("decompressedStrUnicode=%s" % decompressedStrUnicode)
        decompressedJson = json.loads(decompressedStrUnicode)
        respDataDict = decompressedJson
        return respDataDict


    def generateCurrentHeaders(self, jValueOrUrlEndpoint):
        curHeaders = copy.deepcopy(gHeaders)
        curTimestampInt = getCurTimestamp()


        curTimestampStr = str(curTimestampInt)
        curHeaders["timestamp"] = curTimestampStr


        curSignature = self.generateSignature(curTimestampInt, jValueOrUrlEndpoint)
        curHeaders["signature"] = curSignature


        return curHeaders


    def dictValueStrToJson(self, originDict):
        """
            auto detect json filed name is xxxJson or xxxArrayJson, then convert json str to dict/json
        """
        processedDict = originDict
        if isinstance(processedDict, dict):
            firstLevelKeys = processedDict.keys()
            for eachFieldName in firstLevelKeys:
                isArrayJson = re.match(r"\w+ArrayJson$", eachFieldName)
                isJson = re.match(r"\w+Json$", eachFieldName)
                # print("isArrayJson=%s, isJson=%s" % (isArrayJson, isJson))
                if isArrayJson or isJson:
                    fieldValueJsonStr = processedDict[eachFieldName]
                    # print("%s -> fieldValueJsonStr=%s" % (eachFieldName, fieldValueJsonStr))
                    if fieldValueJsonStr:
                        fieldValueDict = json.loads(fieldValueJsonStr)
                    else:
                        fieldValueDict = None
                    fieldValueDict = self.dictValueStrToJson(fieldValueDict)
                    processedDict[eachFieldName] = fieldValueDict
        elif isinstance(originDict, list):
            newList = []
            for eachItem in originDict:
                processedItem = self.dictValueStrToJson(eachItem)
                newList.append(processedItem)
            processedDict = newList


        return processedDict


    #----------------------------------------
    # Crawl Logic
    #----------------------------------------


    def on_start(self):
        jValueTemplateSelfReading = "{\"userId\":\"%s\",\"fieldName\":\"\",\"fieldValue\":\"全部类别\",\"grades\":\"\",\"levels\":\"\",\"supportingResources\":\"有音频\",\"offset\":%d,\"limit\":%d}"
        jValueTemplateParentChildReading = "{\"userId\":\"%s\",\"fieldName\":\"\",\"fieldValue\":\"全部类别\",\"theStageOfTheChild\":\"\",\"parentalEnglishLevel\":\"\",\"supportingResources\":\"有音频\",\"offset\":%d,\"limit\":%d}"


        paramDictSelfReading = {
            "curUrl": SelfReadingUrl,
            "offset": 0,
            "limit": DefaultPageSize,
            "jValueTemplate": jValueTemplateSelfReading
        }
        self.getBookQuery2(paramDictSelfReading)


        paramDictParentChildReading = {
            "curUrl": ParentChildReadingUrl,
            "offset": 0,
            "limit": DefaultPageSize,
            "jValueTemplate": jValueTemplateParentChildReading
        }
        self.getBookQuery2(paramDictParentChildReading)


    def getBookQuery2(self, curParamDict):
        print("getBookQuery2: curParamDict=%s" % curParamDict)


        curUrl = curParamDict["curUrl"]
        jValueTemplate = curParamDict["jValueTemplate"]
        offset = curParamDict["offset"]
        limit = curParamDict["limit"]


        jValueStr = jValueTemplate % (gUserId, offset, limit)
        jcJsonDict = {
            "J": jValueStr,
            "C": 0
        }
        jcJsonDictStr = json.dumps(jcJsonDict)


        curParamDict["jValueStr"] = jValueStr
        curParamDict["jcJsonDict"] = jcJsonDict
        curParamDict["jcJsonDictStr"] = jcJsonDictStr


        curHeaders = self.generateCurrentHeaders(jValueStr)


        # add hash value for url to force re-crawl when POST url not changed
        timestampStr = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        curUrlWithHash = curUrl + "#" + timestampStr


        fakeItagForceRecrawl = "%s_%s_%s" % (timestampStr, offset, limit)


        self.crawl(curUrlWithHash,
            itag=fakeItagForceRecrawl, # To force re-crawl for next page
            method="POST",
            # data=jcJsonDict,
            data= jcJsonDictStr,
            # callback=curCallback,
            callback=self.getBookQuery2Callback,
            headers=curHeaders,
            save=curParamDict
        )


    def getBookQuery2Callback(self, response):
        respUrl = response.url
        print("respUrl=%s" % respUrl)
        prevParaDict = response.save
        print("prevParaDict=%s" % prevParaDict)
        respJson = response.json
        print("respJson=%s" % respJson)


        respData = self.extractResponseData(respJson)
        print("respData=%s" % respData)


        if respData:
            newOffset = prevParaDict["offset"] + prevParaDict["limit"]
            prevParaDict["offset"] = newOffset
            self.getBookQuery2(prevParaDict)


            bookSeriesList = respData
            for eachBookSerie in bookSeriesList:
                print("eachBookSerie=%s" % eachBookSerie)
                self.getStorybookDetail(eachBookSerie)
        else:
            print("!!! %s return no more data: %s" % (response.url, respJson))


    def getStorybookDetail(self, bookSerieDict):
        print("getStorybookDetail: bookSerieDict=%s" % bookSerieDict)


        seriePrimayKey = bookSerieDict["pk"]


        urlEndpoint = "/Reading.svc/viewEnglishSeries2/%s/%s" % (gUserId, seriePrimayKey)
        fullUrl = "%s%s" % (gServerPort, urlEndpoint)
        # http://www.xiaohuasheng.cn:83/Reading.svc/viewEnglishSeries2/1134723/31
        print("urlEndpoint=%s, fullUrl=%s" % (urlEndpoint, fullUrl))


        curHeaders = self.generateCurrentHeaders(urlEndpoint)


        self.crawl(fullUrl,
            method="GET",
            callback=self.getSerieDetailCallback,
            headers=curHeaders,
            # save=bookSerieDict
        )


    def getSerieDetailCallback(self, response):
        respUrl = response.url
        print("respUrl=%s" % respUrl)
        # bookSerieDict = response.save
        # print("bookSerieDict=%s" % bookSerieDict)
        respJson = response.json
        print("respJson=%s" % respJson)


        respData = self.extractResponseData(respJson)
        print("respData=%s" % respData)


        respDict = respData[0]
        # respDict["url"] = response.url
        # return respDict


        bookSeriesDict = respDict
        seriesId = bookSeriesDict["pk"]


        self.saveSeriesInfo(bookSeriesDict)


        # get audio
        audioPackagesParamDict = {
            "seriesId": seriesId,
            "level": gParamLevelAll,
            "fixParam1": gFixParam1,
            "offset": 0,
            "limit": DefaultPageSize
        }
        self.getSeriesAudioPackages(audioPackagesParamDict)


        # get book info
        bookParamDict = {
            "seriesId": seriesId,
            "level": gParamLevelAll,
            "offset": 0,
            "limit": DefaultPageSize
        }
        self.getSeriesBook(bookParamDict)


    def getSeriesBook(self, paramDict):
        urlEndpoint = "/Reading.svc/queryEnglishSeriesBook/%s/%s/%s/%s/%s" % \
            (gUserId, paramDict["seriesId"], paramDict["level"], paramDict["offset"], paramDict["limit"])
        print("urlEndpoint=%s" % urlEndpoint)
        fullUrl = "%s%s" % (gServerPort, urlEndpoint)
        # http://www.xiaohuasheng.cn:83/Reading.svc/queryEnglishSeriesBook/1134723/31/-1/0/10


        curHeaders = self.generateCurrentHeaders(urlEndpoint)


        self.crawl(fullUrl,
            method="GET",
            callback=self.getSeriesBookCallback,
            headers=curHeaders,
            save=paramDict,
        )


    def getSeriesBookCallback(self, response):
        respUrl = response.url
        print("respUrl=%s" % respUrl)
        respJson = response.json
        print("respJson=%s" % respJson)
        respData = self.extractResponseData(respJson)
        print("respData=%s" % respData)


        if respData:
            prevParamDict = response.save
            curParamDict = prevParamDict
            curParamDict["offset"] += curParamDict["limit"]
            self.getSeriesBook(curParamDict)


            seriesId = curParamDict["seriesId"]
            seriesBookList = respData
            print("seriesBookList=%s" % seriesBookList)
            for eachBookDict in seriesBookList:
                print("eachBookDict=%s" % eachBookDict)
                curBookId = eachBookDict["pk"]
                self.getSingleBookInfo(seriesId, curBookId)
        else:
            print("!!! %s return no more data: %s" % (response.url, respJson))


    def getSingleBookInfo(self, seriesId, curBookId):
        urlEndpoint = "/Reading.svc/getServerBookInfo17/%s/%s/%s/%s/%s" % \
            (gUserId, gLongitude, gLatitude, curBookId, gFixParam2)
        print("urlEndpoint=%s" % urlEndpoint)
        fullUrl = "%s%s" % (gServerPort, urlEndpoint)
        # http://www.xiaohuasheng.cn:83/Reading.svc/getServerBookInfo17/1134723/120.136174/28.997280/109512/10


        curHeaders = self.generateCurrentHeaders(urlEndpoint)


        self.crawl(fullUrl,
            method="GET",
            callback=self.getSingleBookInfoCallback,
            headers=curHeaders,
            save=seriesId,
        )


    def getSingleBookInfoCallback(self, response):
        seriesId = response.save
        print("seriesId=%s" % seriesId)
        respUrl = response.url
        print("respUrl=%s" % respUrl)
        respJson = response.json
        print("respJson=%s" % respJson)
        respData = self.extractResponseData(respJson)
        print("respData=%s" % respData)


        bookInfoDict = respData[0]
        print("bookInfoDict=%s" % bookInfoDict)
        self.saveSingleBookInfo(seriesId, bookInfoDict)


    def saveSingleBookInfo(self, seriesId, bookInfoDict):
        # curSeriesBooksFolder = getSeriesBooksFolder(seriesId)
        # print("curSeriesBooksFolder=%s" % curSeriesBooksFolder)
        # createFolder(curSeriesBooksFolder)


        bookId = bookInfoDict["pk"]
        singleBooksFolder = getSingleBookFolder(seriesId, bookId)
        print("singleBooksFolder=%s" % singleBooksFolder)
        createFolder(singleBooksFolder)


        singleBookFilename = "series_%s_Books_%s_info.json" % (seriesId, bookId)
        singleBookFullPath = os.path.abspath(os.path.join(singleBooksFolder, singleBookFilename))
        bookInfoDict = self.dictValueStrToJson(bookInfoDict)
        saveJsonToFile(singleBookFullPath, bookInfoDict)


        # download and save: frontCover
        # "frontCover": "149/Book/20160930171033.png",
        coverImageUrlTail = bookInfoDict["frontCover"]
        if coverImageUrlTail:
            coverImageFilename = ("Books_%s_" % bookId) + coverImageUrlTail.replace("/", "_")
            imageFileInfo = {
                "fileUrl": gResourcesRoot + "/" + coverImageUrlTail,
                "filename": coverImageFilename,
                "saveFolder": singleBooksFolder,
            }
            self.downloadFile(imageFileInfo)


    def saveSeriesInfo(self, bookSeriesDict):
        seriesId = bookSeriesDict["pk"]
        curSeriesFolder = getSeriesFolder(seriesId)
        print("curSeriesFolder=%s" % curSeriesFolder)
        createFolder(curSeriesFolder)


        filenamePrefix = "series_%s" % seriesId
        seriesFilename = "%s_info.json" % filenamePrefix
        seriesFullPath = os.path.abspath(os.path.join(curSeriesFolder, seriesFilename))
        bookSeriesDict = self.dictValueStrToJson(bookSeriesDict)
        saveJsonToFile(seriesFullPath, bookSeriesDict)


        # download series cover image
        """
        /series/623/series_623_info.json
        {
            "pk": 623,
            "englishTitle": "Peppa Pig",
            "chineseTitle": "小猪佩奇绘本集",
            "picture": "System/EnglishSeriesPicture/20190114112209525.jpg",
        ...


        /series/158/series_158_info.json
        {
            "pk": 158,
            "englishTitle": "An Elephant and Piggie Book",
            "chineseTitle": "小猪小象绘本系列",
            ...
            "picture": "",
            "lessonPlanFirstPictureUrl": "https://img.xiaohuasheng.cn/20180911145347266_80f5f443a43bb430663a71b381cde40e.jpg",
        """
        fileUrl = None
        coverImageUrlTail = bookSeriesDict["picture"]
        lessonPlanFirstPictureUrl = bookSeriesDict["lessonPlanFirstPictureUrl"]
        if coverImageUrlTail:
            coverImageFilename = filenamePrefix + coverImageUrlTail.replace("/", "_")
            fileUrl = gResourcesRoot + "/" + coverImageUrlTail
        elif lessonPlanFirstPictureUrl:
            coverImageFilename = filenamePrefix + "_" + lessonPlanFirstPictureUrl.split("/")[-1]
            fileUrl = lessonPlanFirstPictureUrl


        if fileUrl:
            imageFileInfo = {
                "fileUrl": fileUrl,
                "filename": coverImageFilename,
                "saveFolder": curSeriesFolder,
            }
            self.downloadFile(imageFileInfo)


    def getSeriesAudioPackages(self, paramDict):
        urlEndpoint = "/Reading.svc/queryEnglishSeriesAudio/%s/%s/%s/%s/%s/%s" % \
            (gUserId, paramDict["seriesId"], paramDict["level"], paramDict["fixParam1"], paramDict["offset"], paramDict["limit"])
        print("urlEndpoint=%s" % urlEndpoint)
        fullUrl = "%s%s" % (gServerPort, urlEndpoint)
        # http://www.xiaohuasheng.cn:83/Reading.svc/queryEnglishSeriesAudio/1134723/31/-1/1/0/10


        """
        http://www.xiaohuasheng.cn:83/Reading.svc/getLevelForQueryEnglishSeriesAudio/1134723/31
        return english series level:
        [
            {
                "pk": -1,
                "name": "全部"
            },
            {
                "pk": 79,
                "name": "Level 1"
            },
            {
                "pk": 80,
                "name": "Level 2"
            },
            {
                "pk": 81,
                "name": "Level 3"
            }
        ]
        """


        curHeaders = self.generateCurrentHeaders(urlEndpoint)


        self.crawl(fullUrl,
            method="GET",
            callback=self.getSeriesAudioPackagesCallback,
            headers=curHeaders,
            save=paramDict,
        )


    def saveSeriesAudioPackagesInfo(self, seriesAudioPackagesInfo):
        print("saveSeriesAudioPackagesInfo: seriesAudioPackagesInfo=%s" % seriesAudioPackagesInfo)
        seriesId = seriesAudioPackagesInfo["seriesId"]
        curAudioPackagesFolder = getSeriesAudioPackagesFolder(seriesId)
        print("curAudioPackagesFolder=%s" % curAudioPackagesFolder)
        if not os.path.exists(curAudioPackagesFolder):
            createFolder(curAudioPackagesFolder)


        audioPackagesFilename = "series_%s_AudioPackages_info.json" % seriesId
        print("audioPackagesFilename=%s" % audioPackagesFilename)
        audioPackagesFullPath = os.path.abspath(os.path.join(curAudioPackagesFolder, audioPackagesFilename))
        if os.path.exists(audioPackagesFullPath):
            print("alreay existed %s" % audioPackagesFullPath)
            # append
            prevAudioPackagesInfo = loadJsonFromFile(audioPackagesFullPath)
            prevSeriesId = prevAudioPackagesInfo["seriesId"]
            if prevSeriesId != seriesId:
                print("!!! Unexpected not same id for saving series audio info, old=%s, new=%s" % (prevSeriesId, seriesId))
            else:
                newAudioPackagesInfo = prevAudioPackagesInfo
                newAudioPackagesInfo["AudioPackages"].extend(seriesAudioPackagesInfo["AudioPackages"])
                saveJsonToFile(audioPackagesFullPath, newAudioPackagesInfo)
        else:
            print("not existed %s" % audioPackagesFullPath)
            # write
            saveJsonToFile(audioPackagesFullPath, seriesAudioPackagesInfo)


    def getSeriesAudioPackagesCallback(self, response):
        respUrl = response.url
        print("respUrl=%s" % respUrl)
        respJson = response.json
        print("respJson=%s" % respJson)
        respData = self.extractResponseData(respJson)
        print("respData=%s" % respData)


        if respData:
            prevParamDict = response.save
            curParamDict = prevParamDict
            curParamDict["offset"] += curParamDict["limit"]
            self.getSeriesAudioPackages(curParamDict)


            seriesAudioPackagesList = respData
            seriesId = curParamDict["seriesId"]
            seriesAudioPackagesInfo = {
                "seriesId": seriesId,
                "AudioPackages": seriesAudioPackagesList
            }
            self.saveSeriesAudioPackagesInfo(seriesAudioPackagesInfo)


            print("seriesAudioPackagesList=%s" % seriesAudioPackagesList)
            for eachAudioPackageDict in seriesAudioPackagesList:
                print("eachAudioPackageDict=%s" % eachAudioPackageDict)
                audioPackageId = eachAudioPackageDict["pk"]
                self.getAudioPackage(seriesId, audioPackageId)
        else:
            print("!!! %s return no more data: %s" % (response.url, respJson))


    def getAudioPackage(self, seriesId, audioPackageId):
        urlEndpoint = "/Reading.svc/viewAudioPackage/%s/%s/%s" % (gUserId, audioPackageId, gFixParam1)
        fullUrl = "%s%s" % (gServerPort, urlEndpoint)
        # http://www.xiaohuasheng.cn:83/Reading.svc/viewAudioPackage/1134723/1808/1
        print("urlEndpoint=%s, fullUrl=%s" % (urlEndpoint, fullUrl))
        curHeaders = self.generateCurrentHeaders(urlEndpoint)


        self.crawl(fullUrl,
            method="GET",
            callback=self.getAudioPackageCallback,
            headers=curHeaders,
            save=seriesId
        )


    def getAudioPackageCallback(self, response):
        seriesId = response.save
        print("seriesId=%s" % seriesId)
        respUrl = response.url
        print("respUrl=%s" % respUrl)
        respJson = response.json
        print("respJson=%s" % respJson)
        respData = self.extractResponseData(respJson)
        print("respData=%s" % respData)


        audioPackageDict = respData[0]
        print("audioPackageDict=%s" % audioPackageDict)
        self.saveSingleAudioPackageInfo(seriesId, audioPackageDict)


        audioArrayJsonStr = audioPackageDict["audioArrayJson"]
        print("audioArrayJsonStr=%s" % audioArrayJsonStr)


        audioPackageId = audioPackageDict["pk"]
        # audioArrayDictList = json.loads(audioArrayJsonStr)
        audioArrayDictList = audioArrayJsonStr
        print("audioArrayDictList=%s" % audioArrayDictList)
        for singleAudioDict in audioArrayDictList:
            print("singleAudioDict=%s" % singleAudioDict)
            singleAudioDict["seriesId"] = seriesId
            singleAudioDict["audioPackageId"] = audioPackageId
            self.saveSingleAudio(singleAudioDict)


    def saveSingleAudioPackageInfo(self, seriesId, audioPackageInfo):
        audioPackageId = audioPackageInfo["pk"]
        curSingleAudioPackageFolder = getSingleAudioPackageFolder(seriesId, audioPackageId)
        print("curSingleAudioPackageFolder=%s" % curSingleAudioPackageFolder)
        createFolder(curSingleAudioPackageFolder)


        filenamePrefix = "series_%s_AudioPackages_%s" % (seriesId, audioPackageId)
        singleAudioPackageFilename = "%s_info.json" % (filenamePrefix)
        singleAudioPackageFullPath = os.path.abspath(os.path.join(curSingleAudioPackageFolder, singleAudioPackageFilename))
        audioPackageInfo = self.dictValueStrToJson(audioPackageInfo)
        saveJsonToFile(singleAudioPackageFullPath, audioPackageInfo)


        # download bookSeriesPicture
        # case 1:
        #   "bookSeriesPicture": "EnglishLevelFrontCoverOrInnerPage/79/封面.jpg",
        # coverImageUrlTail = audioPackageInfo["bookSeriesPicture"]
        # case 2:
        #   "picture": "attached/image/20190114/20190114103636_2075.jpg",
        #   "bookSeriesPicture": "",
        coverImageUrlTail = audioPackageInfo["picture"]
        print("coverImageUrlTail=%s" % coverImageUrlTail)


        if coverImageUrlTail:
            imageSuffix = coverImageUrlTail.split(".")[-1]
            imageFileInfo = {
                "fileUrl": gResourcesRoot + "/" + coverImageUrlTail,
                "filename": "%s_coverImage.%s" % (filenamePrefix, imageSuffix),
                "saveFolder": curSingleAudioPackageFolder,
            }
            self.downloadFile(imageFileInfo)


    def saveSingleAudio(self, singleAudioDict):
        seriesId = singleAudioDict["seriesId"]
        audioPackageId = singleAudioDict["audioPackageId"]
        audioId = singleAudioDict["pk"]


        curSingleAudioFolder = getSingleAudioFolder(seriesId, audioPackageId, audioId)
        print("curSingleAudioFolder=%s" % curSingleAudioFolder)
        createFolder(curSingleAudioFolder)


        filenamePrefix = "series_%s_AudioPackages_%s_audio_%s" % (seriesId, audioPackageId, audioId)
        singleAudioFilename = "%s_info.json" % (filenamePrefix)
        singleAudioFullPath = os.path.abspath(os.path.join(curSingleAudioFolder, singleAudioFilename))
        saveJsonToFile(singleAudioFullPath, singleAudioDict)


        """
        {
            "pk": 6497,
            "picture": "EnglishLevelFrontCoverOrInnerPage/79/封面.jpg",
            "path": "Audio/1808/20180911222508831.mp3",
            "extension": ".mp3",
            "title": "1. Bear Hugs-Listen and Repeat",
            "size": 1735488,
            "duration": 433,
            "sizeString": "1.7M",
            "durationString": "07:13",
            "packageName": "Bear Hugs ",
            "seriesId": 31,
            "audioPackageId": 1808
        }
        """


        # download audio file
        #   "path": "Audio/1808/20180911222516379.mp3",
        audioFileUrlTail = singleAudioDict["path"]
        print("audioFileUrlTail=%s" % audioFileUrlTail)
        if audioFileUrlTail:
            audioFileInfo = {
                "fileUrl": gResourcesRoot + "/" + audioFileUrlTail,
                "filename": ("Aduios_%s_" % audioId) + audioFileUrlTail.replace("/", "_"),
                "saveFolder": curSingleAudioFolder,
            }
            self.downloadFile(audioFileInfo)
本地调试时,可以下载到需要的各种文件。
然后去批量运行,结果报错:
【已解决】PySpider运行批量下载时报错:HTTP 599 Operation timed out after milliseconds with out of bytes received

转载请注明:在路上 » 【已解决】PySpider中用Python代码爬取小花生app中绘本数据

发表我的评论
取消评论

表情

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址
89 queries in 0.197 seconds, using 20.30MB memory