Troubleshooting:
[Unresolved] How to get past the verify.meituan.com anti-scraping verification when crawling Dianping (dianping.com) pages
Along the way, I came up with an approach that seemed logically sound, even perfect:
Each crawl run issues 3000+ requests, but only downloads data for roughly 1500 of them.
The failed requests, the ones the anti-scraping system redirects to the verification page, number 1000+.
In theory I could:
keep writing code that takes these failed URLs as entry points and continues crawling from them, bit by bit.
Even if some of them fail again, repeating this process should eventually cover every data URL and fetch all of the data.
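For reference, the retry code below reads a failed-result JSON file (constContinueCrawlFailedFullPath) that must have been produced somewhere by detecting the redirect to verify.meituan.com. That recording code is not shown in this post, so the following is only my sketch of what such a check could look like inside a pyspider callback; response.url, response.orig_url and response.save are pyspider Response attributes, while saveFailedInfoToFile is a hypothetical helper:

def checkVerifyRedirect(self, response):
    # sketch (my assumption, not the project's actual code): detect the
    # anti-scraping redirect and record the originally requested URL plus its
    # context, so it can be re-crawled later from the failed-result file
    if "verify.meituan.com" in response.url:
        failedInfo = {
            "failedUrl": response.orig_url,  # the dianping.com URL that was requested
            "errorMessage": "ERROR: redirected to verify url: %s" % response.url,
            "curInfo": response.save,  # whatever context was passed in via save=
        }
        self.saveFailedInfoToFile(failedInfo)  # hypothetical helper: append to the failed-result JSON
        return True  # caller should skip parsing this page
    return False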
So I went and updated the code:
def realStart(self):
    # when some crawls failed, use this to re-crawl those failed URLs
    self.continueCrawlFailedResult()

def continueCrawlFailedResult(self):
    # Note: loadJsonFromFile, the const* file paths, genCurHeaders and the
    # callback methods are defined elsewhere in this project; `import re` is also needed
    # generate dict[city_english_name] = city info dict
    mainCityList = loadJsonFromFile(constMainCityFullPath)  # main city with merged level
    cityEnNameToInfoDict = {}
    for eachMainCity in mainCityList:
        # print("eachMainCity=%s" % eachMainCity)
        cityEnName = eachMainCity["cityEnName"]
        cityEnNameToInfoDict[cityEnName] = eachMainCity

    failedResult = loadJsonFromFile(constContinueCrawlFailedFullPath)
    totalFailedCount = failedResult["totalFailedCount"]
    print("totalFailedCount=%s" % totalFailedCount)
    failedInfoDictList = failedResult["failedList"]
"""
{
"totalFailedCount": 47,
"failedList": [
{
"failedUrl": "http://www.dianping.com/shihezi/ch70/g27762",
"errorMessage": "ERROR: redirected to verify url: https://verify.meituan.com/v2/web/general_page?action=spiderindefence&requestCode=ef3f13d1a17341cfa4451cd5dcabcafa&platform=1000&adaptor=auto&succCallbackUrl=https%3A%2F%2Foptimus-mtsi.meituan.com%2Foptimus%2FverifyResult%3ForiginUrl%3Dhttp%253A%252F%252Fwww.dianping.com%252Fshihezi%252Fch70%252Fg27762&theme=dianping"
},
{
"failedUrl": "http://www.dianping.com/wulumuqi/ch70/g27762p3",
"errorMessage": "ERROR: redirected to verify url: https://verify.meituan.com/v2/web/general_page?action=spiderindefence&requestCode=ab30a198169c49cf8457e34d21f1cbd1&platform=1000&adaptor=auto&succCallbackUrl=https%3A%2F%2Foptimus-mtsi.meituan.com%2Foptimus%2FverifyResult%3ForiginUrl%3Dhttp%253A%252F%252Fwww.dianping.com%252Fwulumuqi%252Fch70%252Fg27762p3&theme=dianping"
},
{
"failedUrl": "http://www.dianping.com/shop/12305862",
"errorMessage": "ERROR: redirected to verify url: https://verify.meituan.com/v2/web/general_page?action=spiderindefence&requestCode=96d1229aeb41441e80cc19b1be564fa1&platform=1000&adaptor=auto&succCallbackUrl=https%3A%2F%2Foptimus-mtsi.meituan.com%2Foptimus%2FverifyResult%3ForiginUrl%3Dhttp%253A%252F%252Fwww.dianping.com%252Fshop%252F12305862&theme=dianping"
},
...
"""
    for eachFailedInfoDict in failedInfoDictList:
        print("eachFailedInfoDict=%s" % eachFailedInfoDict)
        failedUrl = eachFailedInfoDict["failedUrl"]
        curInfo = eachFailedInfoDict["curInfo"]
        # classify the failed url:
        # city entry page:  http://www.dianping.com/suzhou/ch70/g27762#20180712_154134_660436
        #                   http://www.dianping.com/suzhou/ch70/g27762
        # city sub page:    http://www.dianping.com/wulumuqi/ch70/g27762p3
        # single shop page: http://www.dianping.com/shop/12305862
        foundCityEntryUrl = re.search(r"http://www\.dianping\.com/(?P<cityEnName>\w+)/ch70/g27762(#\w+)?$", failedUrl)
        foundCitySubPageUrl = re.search(r"http://www\.dianping\.com/(?P<cityEnName>\w+)/ch70/g27762p(?P<curPageNumber>\d+)$", failedUrl)
        foundSingleShopUrl = re.search(r"http://www\.dianping\.com/shop/(?P<shopIdStr>\d+)", failedUrl)
        print("foundCityEntryUrl=%s" % foundCityEntryUrl)
        print("foundCitySubPageUrl=%s" % foundCitySubPageUrl)
        print("foundSingleShopUrl=%s" % foundSingleShopUrl)

        if foundCityEntryUrl:
            cityEnName = foundCityEntryUrl.group("cityEnName")
            if cityEnName in cityEnNameToInfoDict:
                curMainCityDict = cityEnNameToInfoDict[cityEnName]
                self.crawlCityEntry(failedUrl, curMainCityDict)
        elif foundCitySubPageUrl:
            # cityEnName = foundCitySubPageUrl.group("cityEnName")
            # curPageNumber = foundCitySubPageUrl.group("curPageNumber")
            # if cityEnName in cityEnNameToInfoDict.keys():
            #     curMainCityDict = cityEnNameToInfoDict[cityEnName]
            #     cityEntryUrl = "http://www.dianping.com/%s/ch70/g27762" % cityEnName
            #     self.crawlCitySubPage(curMainCityDict, curPageNumber, cityEntryUrl)
            self.crawlCitySubPage(failedUrl, curInfo)
        elif foundSingleShopUrl:
            self.crawlSingleShop(failedUrl, curInfo)
def crawlCityEntry(self, cityEntryUrl, mainCityDict):
    # re-crawl a failed city entry page, e.g. http://www.dianping.com/suzhou/ch70/g27762
    self.crawl(
        cityEntryUrl,
        callback=self.childEnglishEntryCallback,
        headers=self.genCurHeaders(),
        cookies={},
        save=mainCityDict,
    )

# def crawlCitySubPage(self, mainCityDict, curPageNumber, cityEntryUrl):
def crawlCitySubPage(self, citySubPageUrl, curInfo):
    # re-crawl a failed city sub page, e.g. http://www.dianping.com/wulumuqi/ch70/g27762p3
    # if curPageNumber > 0:
    #     curPageUrl = "%sp%s" % (cityEntryUrl, curPageNumber)
    #     # http://www.dianping.com/suzhou/ch70/g27762p1
    # else:
    #     curPageUrl = cityEntryUrl
    #     # http://www.dianping.com/suzhou/ch70/g27762
    #     # http://www.dianping.com/suzhou/ch70/g27762#20180712_154134_660436
    # print("curPageUrl=%s" % curPageUrl)
    # curInfo = {
    #     "curPageNumber": curPageNumber,
    #     "curMainCity": mainCityDict,
    # }
    self.crawl(
        # curPageUrl,
        citySubPageUrl,
        callback=self.childEnglishSinglePageCallback,
        headers=self.genCurHeaders(),
        cookies={},
        save=curInfo,
    )

def crawlSingleShop(self, shopUrl, curInfo):
    # re-crawl a failed single shop page
    # shopUrl=http://www.dianping.com/shop/13741424
    self.crawl(
        shopUrl,
        callback=self.shopDetailCallback,
        headers=self.genCurHeaders(),
        cookies={
            # "_lxsdk_s": "16a4946bb7c-9d8-b5c-c47%7C%7C1", # chrome
            # "_lxsdk_s": "16a494c1a16-58a-a82-c0e%7C%7C1", # safari
        },
        save=curInfo,
    )

Then I ran it, and here is the result:
Out of the 1800+ failed URLs,
the continued crawl surprisingly only issued 600+ URL requests,
and only about 120 of those actually downloaded data.
The remaining 1000+ URLs seem to have simply disappeared.
Very strange.
So I gave up on this approach.
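One possibility I have not verified (my guess, not something confirmed in this write-up): pyspider identifies a task by its URL, so URLs already submitted in the earlier run may be silently skipped by the scheduler instead of being fetched again. If that is the cause, pyspider's documented crawl parameters age, itag and force_update should make the retried URLs count as fresh work. A minimal sketch, reusing crawlSingleShop from above:

def crawlSingleShop(self, shopUrl, curInfo):
    # sketch (assumption): ask pyspider to re-fetch a URL it has seen before
    self.crawl(
        shopUrl,
        callback=self.shopDetailCallback,
        headers=self.genCurHeaders(),
        cookies={},
        save=curInfo,
        age=0,               # per pyspider docs: how long a crawled page is considered valid
        itag="retryFailed",  # changing itag marks the task as modified
        force_update=True,   # update the stored task params even if the task already exists
    )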
Then later it suddenly occurred to me:
could it be because I was using the company network?
Maybe if I switch to my home network, the previously failed URLs will download normally again.
I'll try that when I get a chance.
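If the exit IP really is the factor, one way to test that without physically changing networks would be pyspider's documented proxy parameter on self.crawl. A minimal sketch (assumption: some usable HTTP proxy is available; the address below is only a placeholder):

def crawlSingleShopViaProxy(self, shopUrl, curInfo):
    # hypothetical variant of crawlSingleShop: retry a failed shop URL through
    # a different exit IP by passing pyspider's `proxy` parameter
    self.crawl(
        shopUrl,
        callback=self.shopDetailCallback,
        headers=self.genCurHeaders(),
        cookies={},
        save=curInfo,
        proxy="username:password@host:port",  # placeholder, not a real proxy
    )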