#------------------------------------------------------------------------------
# check file validation:
# open file url to check return info is match or not
# with exception support
# note: should handle while the file url is redirect
# eg :
# http://publish.it168.com/2007/0627/images/500754.jpg ->
# http://img.publish.it168.com/2007/0627/images/500754.jpg
# other special one:
# sina pic url:
# http://s14.sinaimg.cn/middle/3d55a9b7g9522d474a84d&690
# http://s14.sinaimg.cn/orignal/3d55a9b7g9522d474a84d
# the real url is same with above url
def isFileValid(fileUrl) :
fileIsValid = False;
errReason = "Unknown error";
try :
#print "original fileUrl=",fileUrl;
origFileName = fileUrl.split('/')[-1];
#print "origFileName=",origFileName;
#old: https://ie2zeq.bay.livefilestore.com/y1mo7UWr-TrmqbBhkw52I0ii__WE6l2UtMRSTZHSky66-uDxnCdKPr3bdqVrpUcQHcoJLedlFXa43bvCp_O0zEGF3JdG_yZ4wRT-c2AQmJ_TNcWvVZIXfBDgGerouWyx19WpA4I0XQR1syRJXjDNpwAbQ/IMG_5214_thumb[1].jpg
#new: https://kxoqva.bay.livefilestore.com/y1mQlGjwNAYiHKoH5Aw6TMNhsCmX2YDR3vPKnP86snuqQEtnZgy3dHkwUvZ61Ah8zU3AGiS4whmm_ADrvxdufEAfMGo56KjLdhIbosn9F34olQ/IMG_5214_thumb%5b1%5d.jpg
unquotedOrigFilenname = urllib.unquote(origFileName);
#print "unquotedOrigFilenname=",unquotedOrigFilenname
lowUnquotedOrigFilename = unquotedOrigFilenname.lower();
#print "lowUnquotedOrigFilename=",lowUnquotedOrigFilename;
resp = urllib2.urlopen(fileUrl, timeout=gConst['defaultTimeout']); # note: Python 2.6 has added timeout support.
#print "resp=",resp;
realUrl = resp.geturl();
#print "realUrl=",realUrl;
newFilename = realUrl.split('/')[-1];
#print "newFilename=",newFilename;
#http://blog.sina.com.cn/s/blog_696e50390100ntxs.html
unquotedNewFilename = urllib.unquote(newFilename);
#print "unquotedNewFilename=",unquotedNewFilename;
unquotedLowNewFilename = unquotedNewFilename.lower();
#print "unquotedLowNewFilename=",unquotedLowNewFilename;
respInfo = resp.info();
#print "respInfo=",respInfo;
respCode = resp.getcode();
#print "respCode=",respCode;
# special:
# http://116.img.pp.sohu.com/images/blog/2007/5/24/17/24/11355bf42a9.jpg
# return no content-length
#contentLen = respInfo['Content-Length'];
# for redirect, if returned size>0 and filename is same, also should be considered valid
#if (origFileName == newFilename) and (contentLen > 0):
# for redirect, if returned response code is 200(OK) and filename is same, also should be considered valid
#if (origFileName == newFilename) and (respCode == 200):
if (lowUnquotedOrigFilename == unquotedLowNewFilename) and (respCode == 200):
fileIsValid = True;
else :
fileIsValid = False;
# eg: Content-Type= image/gif, ContentTypes : audio/mpeg
# more ContentTypes can refer: http://kenya.bokee.com/3200033.html
contentType = respInfo['Content-Type'];
errReason = "file url returned info: type=%s, len=%d, realUrl=%s"%(contentType, contentLen, realUrl);
except urllib2.URLError,reason :
fileIsValid = False;
errReason = reason;
except urllib2.HTTPError,code :
fileIsValid = False;
errReason = code;
except :
fileIsValid = False;
errReason = "Unknown error";
# here type(errReason)= <class 'urllib2.HTTPError'>, so just convert it to str
errReason = str(errReason);
return (fileIsValid, errReason);
#------------------------------------------------------------------------------
# download from fileUrl then save to fileToSave
# with exception support
# note: the caller should make sure the fileUrl is a valid internet resource/file
def downloadFile(fileUrl, fileToSave, needReport = False) :
isDownOK = False;
downloadingFile = '';
#---------------------------------------------------------------------------
# note: totalFileSize -> may be -1 on older FTP servers which do not return a file size in response to a retrieval request
def reportHook(copiedBlocks, blockSize, totalFileSize) :
#global downloadingFile
if copiedBlocks == 0 : # 1st call : once on establishment of the network connection
print 'Begin to download %s, total size=%d'%(downloadingFile, totalFileSize);
else : # rest call : once after each block read thereafter
print 'Downloaded bytes: %d' % ( blockSize * copiedBlocks);
return;
#---------------------------------------------------------------------------
try :
if fileUrl :
downloadingFile = fileUrl;
if needReport :
urllib.urlretrieve(fileUrl, fileToSave, reportHook);
else :
urllib.urlretrieve(fileUrl, fileToSave);
isDownOK = True;
else :
print "Input download file url is NULL";
except urllib.ContentTooShortError(msg) :
isDownOK = False;
except :
isDownOK = False;
return isDownOK;
例 2.21. downloadFile的使用范例
if dstPicFile and downloadFile(curUrl, dstPicFile) :
# replace old url with new url
#------------------------------------------------------------------------------
# manually download fileUrl then save to fileToSave
def manuallyDownloadFile(fileUrl, fileToSave) :
isDownOK = False;
downloadingFile = '';
try :
if fileUrl :
# 1. find real address
#print "fileUrl=",fileUrl;
resp = urllib2.urlopen(fileUrl, timeout=gConst['defaultTimeout']);
#print "resp=",resp;
realUrl = resp.geturl(); # not same with original file url if redirect
# if url is invalid, then add timeout can avoid dead
respHtml = getUrlRespHtml(realUrl, useGzip=False, timeout=gConst['defaultTimeout']);
isDownOK = saveBinDataToFile(respHtml, fileToSave);
else :
print "Input download file url is NULL";
except urllib.ContentTooShortError(msg) :
isDownOK = False;
except :
isDownOK = False;
return isDownOK;
例 2.22. manuallyDownloadFile的使用范例
#if dstPicFile and downloadFile(curUrl, dstPicFile) :
# urlretrieve in downloadFile is too slow while download QQ Space Picture
# so here use manuallyDownloadFile instead
if dstPicFile and manuallyDownloadFile(curUrl, dstPicFile) :
# replace old url with new url
#------------------------------------------------------------------------------
# get response from url
# note: if you have already used cookiejar, then here will automatically use it
# while using urllib2.Request
def getUrlResponse(url, postDict={}, headerDict={}, timeout=0, useGzip=False) :
    # Open url and return the urllib2 response object.
    # note: if you have already installed a cookiejar opener, it is used
    # automatically while urllib2.Request is opened
    # url        : target url (converted to str; unicode makes urlopen error)
    # postDict   : if non-empty, send a POST with this dict urlencoded as body
    # headerDict : extra request headers; allowed to overwrite the defaults
    # timeout    : socket timeout in seconds; 0 means use the system default
    # useGzip    : if True, advertise gzip/deflate support to the server
    # make sure url is string, not unicode, otherwise urllib2.urlopen will error
    url = str(url)
    if (postDict) :
        postData = urllib.urlencode(postDict)
        req = urllib2.Request(url, postData)
        req.add_header('Content-Type', "application/x-www-form-urlencoded")
    else :
        req = urllib2.Request(url)
    defHeaderDict = {
        'User-Agent'    : gConst['userAgentIE9'],
        'Cache-Control' : 'no-cache',
        'Accept'        : '*/*',
        'Connection'    : 'Keep-Alive',
    }
    # add default headers firstly
    for eachDefHd in defHeaderDict.keys() :
        req.add_header(eachDefHd, defHeaderDict[eachDefHd])
    if(useGzip) :
        req.add_header('Accept-Encoding', 'gzip, deflate')
    # add customized header later -> allow overwrite default header
    # (fix: original also added headerDict BEFORE the defaults; add_header
    # replaces by key, so those values were immediately overwritten -- the
    # first loop was pure dead work and has been removed)
    if(headerDict) :
        for key in headerDict.keys() :
            req.add_header(key, headerDict[key])
    if(timeout > 0) :
        # set timeout value if necessary
        resp = urllib2.urlopen(req, timeout=timeout)
    else :
        resp = urllib2.urlopen(req)
    return resp
例 2.23. getUrlResponse的使用范例
resp = getUrlResponse(url, postDict, headerDict, timeout, useGzip);
respHtml = resp.read();
#------------------------------------------------------------------------------
# get response html==body from url
#def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=False) :
def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=True) :
    # Fetch url and return the response body (html).
    # Parameters are passed straight through to getUrlResponse; when gzip
    # was requested and the server really compressed the body, it is
    # decompressed before returning.
    resp = getUrlResponse(url, postDict, headerDict, timeout, useGzip)
    respHtml = resp.read()
    if(useGzip) :
        respInfo = resp.info()
        # Some servers ignore "Accept-Encoding: gzip, deflate" and return
        # plain html WITHOUT a "Content-Encoding: gzip" response header
        # eg: http://blog.sina.com.cn/s/comment_730793bf010144j7_3.html
        # -> only decompress when the response says the data really is gzipped
        isGzippedData = ("Content-Encoding" in respInfo) and (respInfo['Content-Encoding'] == "gzip")
        if(isGzippedData) :
            # 16+MAX_WBITS makes zlib expect a gzip (not raw zlib) header
            respHtml = zlib.decompress(respHtml, 16+zlib.MAX_WBITS)
    return respHtml
例 2.25. getUrlRespHtml的使用范例:带额外参数
modifyUrl = gVal['blogEntryUrl'] + "/blog/submit/modifyblog";
#logging.debug("Modify Url is %s", modifyUrl);
#http://hi.baidu.com/wwwhaseecom/blog/item/79188d1b4fa36f068718bf79.html
foundSpBlogID = re.search(r"blog/item/(?P<spBlogID>\w+?).html", url);
if(foundSpBlogID) :
spBlogID = foundSpBlogID.group("spBlogID");
logging.debug("Extracted spBlogID=%s", spBlogID);
else :
modifyOk = False;
errInfo = "Can't extract post spBlogID !";
return (modifyOk, errInfo);
newPostContentGb18030 = newPostContentUni.encode("GB18030");
categoryGb18030 = infoDict['category'].encode("GB18030");
titleGb18030 = infoDict['title'].encode("GB18030");
postDict = {
"bdstoken" : gVal['spToken'],
"ct" : "1",
"mms_flag" : "0",
"cm" : "2",
"spBlogID" : spBlogID,
"spBlogCatName_o": categoryGb18030, # old catagory
"edithid" : "",
"previewImg" : "",
"spBlogTitle" : titleGb18030,
"spBlogText" : newPostContentGb18030,
"spBlogCatName" : categoryGb18030, # new catagory
"spBlogPower" : "0",
"spIsCmtAllow" : "1",
"spShareNotAllow":"0",
"spVcode" : "",
"spVerifyKey" : "",
}
headerDict = {
# 如果不添加Referer,则返回的html则会出现错误:"数据添加的一般错误"
"Referer" : gVal['blogEntryUrl'] + "/blog/modify/" + spBlogID,
}
respHtml = getUrlRespHtml(modifyUrl, postDict, headerDict);
因为成功登录某网页后,一般都会有对应的cookie返回,所以常用此函数去判断是否成功登录某网页。
#------------------------------------------------------------------------------
# check all cookies in cookiesDict is exist in cookieJar or not
def checkAllCookiesExist(cookieNameList, cookieJar) :
    # Check whether every cookie name in cookieNameList exists in cookieJar.
    # Commonly used to verify a login succeeded (expected session cookies set).
    # cookieNameList : list of cookie names that must all be present
    # cookieJar      : iterable of cookie objects exposing a .name attribute
    # Returns True only when all names are found (trivially True for an
    # empty cookieNameList, same as the original flag-dict version).
    # idiom: collect the existing names once, then do membership tests --
    # replaces the original two-pass flag-dict bookkeeping
    existingCookieNames = set()
    for cookie in cookieJar :
        existingCookieNames.add(cookie.name)
    for eachCookieName in cookieNameList :
        if eachCookieName not in existingCookieNames :
            return False
    return True
例 2.26. checkAllCookiesExist的使用范例
#http://www.darlingtree.com/wordpress/archives/242
gVal['cj'] = cookielib.CookieJar();
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(gVal['cj']));
urllib2.install_opener(opener);
resp = urllib2.urlopen(baiduSpaceEntryUrl);
loginBaiduUrl = "https://passport.baidu.com/?login";
#username=%D0%C4%C7%E9%C6%DC%CF%A2%B5%D8&password=xxx&mem_pass=on
postDict = {
'username' : username,
'password' : password,
'mem_pass' : 'on',
};
resp = getUrlResponse(loginBaiduUrl, postDict);
# check whether the cookie is OK
cookieNameList = ["USERID", "PTOKEN", "STOKEN"];
loginOk = checkAllCookiesExist(cookieNameList, gVal['cj']);
if (not loginOk) :
logging.error("Login fail for not all expected cookies exist !");
return loginOk;