Tinkering with:
[Unsolved] How to get past the anti-crawling verification verify.meituan.com when scraping Dianping (dianping.com) web pages
Along the way, I came up with an approach that seemed logically sound, almost perfect:
Each full crawl issues 3000+ requests, but only returns data for about 1500 of them.
The failures, i.e. requests that the anti-crawling system redirects to the verification page, number 1000+.
In theory I could:
Keep writing code that takes these failed URLs as entry points and continues crawling from them, bit by bit.
Even if some of them fail again, repeating this process should eventually cover every data URL and fetch all of the data, as sketched below.
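Stripped of the crawler specifics, the idea is just a retry loop over a saved failure list. A minimal sketch of that loop, assuming the failure list uses the JSON format shown further down; fetchOk, retryFailedUrls, the throttling delay, and the requests-based fetch are all illustrative, not the actual crawler code, and this only shows the retry bookkeeping, not the parsing or saving of page data:

import json
import time

import requests

VERIFY_HOST = "verify.meituan.com"

def fetchOk(url):
    # treat the fetch as failed if the final url ends up on the verification page
    resp = requests.get(url, timeout=30)
    return VERIFY_HOST not in resp.url

def retryFailedUrls(failedListPath, maxRounds=10):
    # keep re-crawling whatever is still in the failure list, round after round
    for _ in range(maxRounds):
        with open(failedListPath, encoding="utf-8") as f:
            failedList = json.load(f)["failedList"]
        if not failedList:
            break  # every url has been fetched successfully
        stillFailed = []
        for eachFailed in failedList:
            if not fetchOk(eachFailed["failedUrl"]):
                stillFailed.append(eachFailed)
            time.sleep(1)  # throttle between requests
        with open(failedListPath, "w", encoding="utf-8") as f:
            json.dump({"totalFailedCount": len(stillFailed), "failedList": stillFailed},
                      f, ensure_ascii=False, indent=2)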
So I went to update the code. (These are methods on the existing crawler handler class, with pyspider-style self.crawl calls; loadJsonFromFile and the const* paths are helpers and constants defined elsewhere in the script.)
def realStart(self):
    # when some crawl produced failed results, use this to re-crawl those failed urls
    self.continueCrawlFailedResult()

def continueCrawlFailedResult(self):
    # generate dict[city_english_name] = city info dict
    mainCityList = loadJsonFromFile(constMainCityFullPath) # main city with merged level
    cityEnNameToInfoDict = {}
    for eachMainCity in mainCityList:
        # print("eachMainCity=%s" % eachMainCity)
        cityEnName = eachMainCity["cityEnName"]
        cityEnNameToInfoDict[cityEnName] = eachMainCity

    failedResult = loadJsonFromFile(constContinueCrawlFailedFullPath)
    totalFailedCount = failedResult["totalFailedCount"]
    print("totalFailedCount=%s" % totalFailedCount)
    failedInfoDictList = failedResult["failedList"]
    """
    Example content of the failed-result file (each entry is also expected to
    carry a "curInfo" field with the original crawl context, omitted here):
    {
        "totalFailedCount": 47,
        "failedList": [
            {
                "failedUrl": "http://www.dianping.com/shihezi/ch70/g27762",
                "errorMessage": "ERROR: redirected to verify url: https://verify.meituan.com/v2/web/general_page?action=spiderindefence&requestCode=ef3f13d1a17341cfa4451cd5dcabcafa&platform=1000&adaptor=auto&succCallbackUrl=https%3A%2F%2Foptimus-mtsi.meituan.com%2Foptimus%2FverifyResult%3ForiginUrl%3Dhttp%253A%252F%252Fwww.dianping.com%252Fshihezi%252Fch70%252Fg27762&theme=dianping"
            },
            {
                "failedUrl": "http://www.dianping.com/wulumuqi/ch70/g27762p3",
                "errorMessage": "ERROR: redirected to verify url: https://verify.meituan.com/v2/web/general_page?action=spiderindefence&requestCode=ab30a198169c49cf8457e34d21f1cbd1&platform=1000&adaptor=auto&succCallbackUrl=https%3A%2F%2Foptimus-mtsi.meituan.com%2Foptimus%2FverifyResult%3ForiginUrl%3Dhttp%253A%252F%252Fwww.dianping.com%252Fwulumuqi%252Fch70%252Fg27762p3&theme=dianping"
            },
            {
                "failedUrl": "http://www.dianping.com/shop/12305862",
                "errorMessage": "ERROR: redirected to verify url: https://verify.meituan.com/v2/web/general_page?action=spiderindefence&requestCode=96d1229aeb41441e80cc19b1be564fa1&platform=1000&adaptor=auto&succCallbackUrl=https%3A%2F%2Foptimus-mtsi.meituan.com%2Foptimus%2FverifyResult%3ForiginUrl%3Dhttp%253A%252F%252Fwww.dianping.com%252Fshop%252F12305862&theme=dianping"
            },
            ...
    """
    for eachFailedInfoDict in failedInfoDictList:
        print("eachFailedInfoDict=%s" % eachFailedInfoDict)
        failedUrl = eachFailedInfoDict["failedUrl"]
        curInfo = eachFailedInfoDict["curInfo"]
        # classify the failed url into one of three kinds:
        #   city entry page:  http://www.dianping.com/suzhou/ch70/g27762 (optionally with #20180712_154134_660436)
        #   city sub page:    http://www.dianping.com/suzhou/ch70/g27762p3
        #   single shop page: http://www.dianping.com/shop/12305862
        foundCityEntryUrl = re.search(r"http://www\.dianping\.com/(?P<cityEnName>\w+)/ch70/g27762(#\w+)?$", failedUrl)
        foundCitySubPageUrl = re.search(r"http://www\.dianping\.com/(?P<cityEnName>\w+)/ch70/g27762p(?P<curPageNumber>\d+)$", failedUrl)
        foundSingleShopUrl = re.search(r"http://www.dianping.com/shop/(?P<shopIdStr>\d+)", failedUrl)
        print("foundCityEntryUrl=%s" % foundCityEntryUrl)
        print("foundCitySubPageUrl=%s" % foundCitySubPageUrl)
        print("foundSingleShopUrl=%s" % foundSingleShopUrl)
        if foundCityEntryUrl:
            cityEnName = foundCityEntryUrl.group("cityEnName")
            if cityEnName in cityEnNameToInfoDict.keys():
                curMainCityDict = cityEnNameToInfoDict[cityEnName]
                self.crawlCityEntry(failedUrl, curMainCityDict)
        elif foundCitySubPageUrl:
            # cityEnName = foundCitySubPageUrl.group("cityEnName")
            # curPageNumber = foundCitySubPageUrl.group("curPageNumber")
            # if cityEnName in cityEnNameToInfoDict.keys():
            #     curMainCityDict = cityEnNameToInfoDict[cityEnName]
            #     cityEntryUrl = "http://www.dianping.com/%s/ch70/g27762" % cityEnName
            #     self.crawlCitySubPage(curMainCityDict, curPageNumber, cityEntryUrl)
            self.crawlCitySubPage(failedUrl, curInfo)
        elif foundSingleShopUrl:
            self.crawlSingleShop(failedUrl, curInfo)

def crawlCityEntry(self, cityEntryUrl, mainCityDict):
    self.crawl(
        cityEntryUrl,
        callback=self.childEnglishEntryCallback,
        headers=self.genCurHeaders(),
        cookies={},
        save=mainCityDict,
    )

# def crawlCitySubPage(self, mainCityDict, curPageNumber, cityEntryUrl):
def crawlCitySubPage(self, citySubPageUrl, curInfo):
    # if curPageNumber > 0:
    #     curPageUrl = "%sp%s" % (cityEntryUrl, curPageNumber)
    #     # http://www.dianping.com/suzhou/ch70/g27762p1
    # else:
    #     curPageUrl = cityEntryUrl
    #     # http://www.dianping.com/suzhou/ch70/g27762
    #     # http://www.dianping.com/suzhou/ch70/g27762#20180712_154134_660436
    # print("curPageUrl=%s" % curPageUrl)
    # curInfo = {
    #     "curPageNumber": curPageNumber,
    #     "curMainCity": mainCityDict,
    # }
    self.crawl(
        # curPageUrl,
        citySubPageUrl,
        callback=self.childEnglishSinglePageCallback,
        headers=self.genCurHeaders(),
        cookies={},
        save=curInfo,
    )

def crawlSingleShop(self, shopUrl, curInfo):
    # shopUrl=http://www.dianping.com/shop/13741424
    self.crawl(
        shopUrl,
        callback=self.shopDetailCallback,
        headers=self.genCurHeaders(),
        cookies={
            # "_lxsdk_s": "16a4946bb7c-9d8-b5c-c47%7C%7C1", # chrome
            # "_lxsdk_s": "16a494c1a16-58a-a82-c0e%7C%7C1", # safari
        },
        save=curInfo,
    )
Then I ran it. The result:
Out of the 1800+ failed URLs,
the re-crawl only issued 600+ requests,
and of those, only about 120 URLs actually returned data.
The remaining 1000+ URLs just disappeared; no idea where they went.
Very strange.
So I gave up on this approach.
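If I come back to this, one quick sanity check would be to diff the failure list against the URLs the re-crawl actually requested, to see exactly which ones were silently dropped. A rough sketch, assuming the requested URLs can be dumped from the logs into a plain text file, one per line (both file paths here are placeholders):

import json

# placeholders: the saved failure list, and a dump of the urls that the
# re-crawl actually requested (one url per line, e.g. pulled from logs)
failedListPath = "continue_crawl_failed.json"
requestedUrlsPath = "requested_urls.txt"

with open(failedListPath, encoding="utf-8") as f:
    failedUrls = {each["failedUrl"] for each in json.load(f)["failedList"]}

with open(requestedUrlsPath, encoding="utf-8") as f:
    requestedUrls = {line.strip() for line in f if line.strip()}

neverRequested = failedUrls - requestedUrls
print("failed=%d, re-requested=%d, never requested=%d"
      % (len(failedUrls), len(requestedUrls & failedUrls), len(neverRequested)))
for url in sorted(neverRequested):
    print(url)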
Later it suddenly occurred to me:
could it be because
I was using the company network?
If I switch to my home network, maybe the previously failed URLs can be re-downloaded normally?
I'll try that when I get a chance.