// valid charset: "GB18030"/"UTF-8", invalid: "UTF8"
public string getUrlRespHtml(string url,
    Dictionary<string, string> headerDict = defHeaderDict,
    string charset = defCharset,
    Dictionary<string, string> postDict = defPostDict,
    int timeout = defTimeout,
    string postDataStr = defPostDataStr,
    int readWriteTimeout = defReadWriteTimeout)
{
    string respHtml = "";
    HttpWebResponse resp = getUrlResponse(url, headerDict, postDict, timeout, postDataStr, readWriteTimeout);
    //long realRespLen = resp.ContentLength;
    if (resp != null)
    {
        StreamReader sr;
        Stream respStream = resp.GetResponseStream();
        if (!string.IsNullOrEmpty(charset))
        {
            Encoding htmlEncoding = Encoding.GetEncoding(charset);
            sr = new StreamReader(respStream, htmlEncoding);
        }
        else
        {
            sr = new StreamReader(respStream);
        }
        try
        {
            respHtml = sr.ReadToEnd();
            //while (!sr.EndOfStream)
            //{
            //    respHtml = respHtml + sr.ReadLine();
            //}
            //string curLine = "";
            //while ((curLine = sr.ReadLine()) != null)
            //{
            //    respHtml = respHtml + curLine;
            //}
            ////http://msdn.microsoft.com/zh-cn/library/system.io.streamreader.peek.aspx
            //while (sr.Peek() > -1) //while no error and not yet at end of stream
            //{
            //    respHtml = respHtml + sr.ReadLine();
            //}
            //respStream.Close();
            //sr.Close();
            //resp.Close();
        }
        catch (Exception ex)
        {
            //[Unsolved] C# StreamReader exception: unhandled ObjectDisposedException, cannot access a closed stream
            //http://www.crifan.com/csharp_streamreader_unhandled_exception_objectdisposedexception_cannot_access_closed_stream
            //System.ObjectDisposedException
            respHtml = "";
        }
        finally
        {
            if (respStream != null)
            {
                respStream.Close();
            }
            if (sr != null)
            {
                sr.Close();
            }
            if (resp != null)
            {
                resp.Close();
            }
        }
    }
    return respHtml;
}
Clearly, many of getUrlRespHtml's parameters are very similar to those of Section 9.5, "Getting the response of a Url: getUrlResponse", introduced earlier.
Still, getUrlRespHtml's parameters deserve a brief explanation here:
The parameters url, headerDict, postDict, timeout, postDataStr and readWriteTimeout have the same meaning as the corresponding parameters of getUrlResponse, so they are not described again.
One remaining parameter does need explaining:
charset specifies which character encoding to use when decoding the returned page content.
The default value of charset is defCharset.
The value of defCharset is:
private const string defCharset = null;
The reason defCharset is not one of the usual values such as GBK or UTF-8 is to support the case where no charset is given: then no particular encoding is forced when decoding the content read by the StreamReader.
That way you get the returned html as-is, and can do your own processing on it later, such as decoding it yourself.
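For example, a minimal sketch of the two kinds of calls (the url and charset values below are only placeholders):
string someUrl = "http://www.example.com/"; //placeholder url, only for illustration
//no charset given: no particular encoding is forced (StreamReader's own default is used)
string rawHtml = getUrlRespHtml(someUrl);
//charset given: the content is decoded with the specified encoding, e.g. GB18030
string decodedHtml = getUrlRespHtml(someUrl, charset: "GB18030");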
Internally, getUrlRespHtml already implements quite a lot of relatively complex functionality, which is worth explaining in detail:
getUrlRespHtml calls getUrlResponse internally, which already sets the corresponding User-Agent.
By default the IE8 User-Agent is used; the relevant code is:
//IE7
const string constUserAgent_IE7_x64 = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)";
//IE8
const string constUserAgent_IE8_x64 = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)";
//IE9
const string constUserAgent_IE9_x64 = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"; // x64
const string constUserAgent_IE9_x86 = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"; // x86
//Chrome
const string constUserAgent_Chrome = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.99 Safari/533.4";
//Mozilla Firefox
const string constUserAgent_Firefox = "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:1.9.2.6) Gecko/20100625 Firefox/3.6.6";
private string gUserAgent;
gUserAgent = constUserAgent_IE8_x64;
req.UserAgent = gUserAgent;
So the request will not be treated by the server as a plain robot or spider crawler.
The relevant internal code:
req.AllowAutoRedirect = true;
Automatic redirection is enabled by default.
If you want to disable automatic redirection, add an "AllowAutoRedirect" entry with the value "false" to headerDict.
For more usage, see the later examples.
The relevant internal code:
req.Headers["Accept-Encoding"] = "gzip, deflate";
//req.AutomaticDecompression = DecompressionMethods.GZip;
req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
This is the parameter explained earlier in Section 9.5.1.4, "getUrlResponse's parameter: timeout": it is the network-level timeout, which applies to GetResponse and GetRequestStream.
The relevant internal code is:
if (timeout > 0)
{
    req.Timeout = timeout;
}
This is the parameter explained earlier in Section 9.5.1.6, "getUrlResponse's parameter: readWriteTimeout": it is the read/write timeout of a StreamReader or StreamWriter, which applies to calls such as ReadLine.
The relevant internal code is:
if (readWriteTimeout > 0)
{
    //default ReadWriteTimeout is 300000 = 300 seconds = 5 minutes !!!
    //too long, so here changed to 30000 = 30 seconds
    //to support a timeout for the later StreamReader's ReadToEnd
    req.ReadWriteTimeout = readWriteTimeout;
}
getUrlRespHtml also handles cookies automatically internally.
The relevant internal code is:
CookieCollection curCookies = null;
curCookies = new CookieCollection();
if (curCookies != null)
{
    req.CookieContainer = new CookieContainer();
    req.CookieContainer.PerDomainCapacity = 40; // following will exceed max default 20 cookie per domain
    req.CookieContainer.Add(curCookies);
}
resp = (HttpWebResponse)req.GetResponse();
updateLocalCookies(resp.Cookies, ref curCookies);
Note that the limit is raised to 40 cookies per domain because, while working on InsertSkydriveFiles earlier, I hit a relatively extreme case: there were more than the default maximum of 20 cookies and a single CookieContainer could no longer hold them all, so the limit was changed to 40 to support that many cookies.
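Because cookies are stored and re-sent automatically, two consecutive calls share the same cookies without any extra code; a minimal sketch (the urls below are only placeholders):
//first request: any cookies the server sets are kept internally
string firstPageHtml = getUrlRespHtml("http://www.example.com/login");
//second request: the stored cookies are sent back automatically, so the server sees the same session
string secondPageHtml = getUrlRespHtml("http://www.example.com/somePage");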
getUrlRespHtml has quite a few parameters, and correspondingly many ways to use it.
Here, examples are used to show how to use this getUrlRespHtml function.
The most common, and simplest, usage of getUrlRespHtml is to pass in only the url and get the returned html.
The code is as follows:
Example 9.7. getUrlRespHtml usage example: passing in only the url to get the html
string mainJsUrl = "http://image.songtaste.com/inc/main.js";
string respHtmlMainJs = getUrlRespHtml(mainJsUrl);
getUrlRespHtml automatically takes care of all kinds of details for you, such as cookies and the User-Agent header, so you directly get the corresponding returned html.
Very often, when crawling pages or emulating a login, you need to specify some extra headers to achieve a particular goal.
For example, adding the corresponding Referer so that the page logic is emulated correctly and the desired content is returned:
string tmpRespHtml = "";
Dictionary<string, string> headerDict;
//(1)to get cookies
string pageRankMainUrl = "http://pagerank.webmasterhome.cn/";
tmpRespHtml = getUrlRespHtml(pageRankMainUrl);
//(2)ask page rank
string firstBaseUrl = "http://pagerank.webmasterhome.cn/?domain=";
//http://pagerank.webmasterhome.cn/?domain=answers.yahoo.com
string firstWholeUrl = firstBaseUrl + noHttpPreDomainUrl;
headerDict = new Dictionary<string, string>();
headerDict.Add("referer", pageRankMainUrl);
tmpRespHtml = getUrlRespHtml(firstWholeUrl, headerDict: headerDict);
As described in Section 9.6.2.2, "Automatic redirection is allowed by default", automatic redirection is enabled by default; to disable it, set it through the header:
Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("AllowAutoRedirect", "false");
string respHtml = getUrlRespHtml(yourUrl, headerDict: headerDict);
The default Accept here is "*/*"; if you want to specify a different type, set it manually through the header:
Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("Accept", "text/html");
string respHtml = getUrlRespHtml(yourUrl, headerDict: headerDict);
For more possible values of Accept, see the official specification: 14.1 Accept
The default KeepAlive here is true; if you do not want to keep the connection alive, disable it through the header:
Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("Keep-Alive", "false");
string respHtml = getUrlRespHtml(yourUrl, headerDict: headerDict);
No Accept-Language is specified by default; if needed, set it through the header:
Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("Accept-Language", "en-US"); //"zh-CN"
string respHtml = getUrlRespHtml(yourUrl, headerDict: headerDict);
For more possible values of Accept-Language, see the official specification: 14.4 Accept-Language
As described in Section 9.6.2.1, "The IE8 User-Agent is already specified internally by default", my getUrlRespHtml adds the IE8 User-Agent by default.
If needed, you can replace it with another one, for example the Firefox User-Agent:
//Mozilla Firefox
const string constUserAgent_Firefox = "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:1.9.2.6) Gecko/20100625 Firefox/3.6.6";
Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("User-Agent", constUserAgent_Firefox);
string respHtml = getUrlRespHtml(yourUrl, headerDict: headerDict);
You can find the User-Agent strings of various browsers online, or refer to the values in my code:
//IE7
const string constUserAgent_IE7_x64 = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)";
//IE8
const string constUserAgent_IE8_x64 = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)";
//IE9
const string constUserAgent_IE9_x64 = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"; // x64
const string constUserAgent_IE9_x86 = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"; // x86
//Chrome
const string constUserAgent_Chrome = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.99 Safari/533.4";
//Mozilla Firefox
const string constUserAgent_Firefox = "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:1.9.2.6) Gecko/20100625 Firefox/3.6.6";
By default, no ContentType is specified for GET, while for POST it is already set to "application/x-www-form-urlencoded".
If you have other special needs and want to set the ContentType, you can set it through the header:
Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("Content-Type", "text/plain");
string respHtml = getUrlRespHtml(yourUrl, headerDict: headerDict);
For more possible values of Content-Type, see the official specification: 14.17 Content-Type
In many cases you also need to set other, non-standard, header fields; these can likewise be set through the header.
For example, what was used earlier when working on InsertSkydriveFiles:
string createFolerUrl = "https://skydrive.live.com/API/2/AddFolder?lct=1";
Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("Accept", "application/json");
headerDict.Add("Referer", constSkydriveUrl);
headerDict.Add("Canary", gCanary);
headerDict.Add("Appid", gAppid);
headerDict.Add("X-Requested-With", "XMLHttpRequest");
headerDict.Add("Cache-Control", "no-cache");
string postDataStr = genCreateFolderPostData(folderName, parentId, cid);
respJson = getUrlRespHtml(createFolerUrl, headerDict:headerDict, postDataStr:postDataStr);
Sometimes the page itself uses a particular encoding, so to decode the returned html correctly you need to specify the corresponding character encoding charset:
string songtasteUserUrl = "http://www.songtaste.com/user/351979/";
string songtasteHtmlCharset = "GB18030";
string respHtmlUnicode = getUrlRespHtml(songtasteUserUrl, charset:songtasteHtmlCharset);
This returns the corresponding, already decoded, Unicode string.
If the default network timeout of 30 seconds does not suit you, you can specify your own, for example:
int timeoutInMilliSec = 10 * 1000;
string respHtml = getUrlRespHtml(someUrl, timeout:timeoutInMilliSec);
If the default stream read/write timeout of 30 seconds does not suit you, you can specify your own, for example:
int streamRdWrTimeout = 20 * 1000;
string respHtml = getUrlRespHtml(someUrl, readWriteTimeout:streamRdWrTimeout);
When emulating a login, POST is often needed, and the corresponding POST data has to be passed.
There are two main ways to pass POST data:
Normally the data is passed in through postDict;
internally it is converted by quoteParas into the corresponding post data, separated by "&".
In a few special cases, postDataStr is used instead;
the post data it carries is separated by newlines. In that case, leave postDict unset (it defaults to null) and just set the corresponding postDataStr.
Below, several examples are given for both cases to show how to use them:
For example, what was used earlier in Section 9.11, "Looking up a domain's Page Rank: getDomainPageRank":
//Method 1: use http://www.pagerankme.com/
queryUrl = "http://www.pagerankme.com/";
postDict = new Dictionary<string, string>();
postDict.Add("url", domainUrl);
respHtml = getUrlRespHtml(queryUrl, postDict: postDict);
For example, what was used earlier when working on DownloadSongtasteMusic:
const string stHtmlCharset = "GB18030";
Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("x-requested-with", "XMLHttpRequest");
// when click play
// access http://songtaste.com/time.php, post data:
//str=5bf271ccad05f95186be764f725e9aaf07e0c7791a89123a9addb2a239179e64c91834c698a9c5d82f1ced3fe51ffc51&sid=3015123&t=0
Dictionary<string, string> postDict = new Dictionary<string, string>();
postDict.Add("str", str);
postDict.Add("sid", sid);
postDict.Add("t", "0");
string getRealAddrUrl = "http://songtaste.com/time.php";
songInfo.realAddr = crl.getUrlRespHtml(getRealAddrUrl, headerDict:headerDict, postDict:postDict, charset:stHtmlCharset);
For example, what was encountered earlier in "[Unsolved] 403 error when uploading a single file through the Baidu API": the post data is separated by newlines, so the corresponding postDataStr has to be set directly:
string[] token = respTokenJson.Split(',');
string tokenStr = token[2].Split(':')[1].Trim('"');
byte[] fileBytes = null;
string filename = "fileForUpload2.txt";
string fullFilePath = @"d:\" + filename;
using (FileStream fs = new FileStream(fullFilePath, FileMode.Open))
{
    fileBytes = new byte[fs.Length];
    fs.Read(fileBytes, 0, fileBytes.Length);
}
StringBuilder buffer = new StringBuilder();
char[] fileCh = new char[fileBytes.Length];
for (int i = 0; i < fileBytes.Length; i++)
    fileCh[i] = (char)fileBytes[i];
buffer.Append(fileCh);
//postDict = new Dictionary<string, string>();
//postDict.Add("file", buffer.ToString());
string postDataStr = buffer.ToString();
string uploadSingleFileUrl = "https://pcs.baidu.com/rest/2.0/pcs/file?";
Dictionary<string, string> queryParaDict = new Dictionary<string, string>();
queryParaDict.Add("method", "upload");
queryParaDict.Add("access_token", tokenStr);
queryParaDict.Add("path", "/apps/测试应用/" + filename);
uploadSingleFileUrl += crifanLib.quoteParas(queryParaDict);
curCookies = crifanLib.getCurCookies();
newCookies = new CookieCollection();
foreach (Cookie ck in curCookies)
{
    if (ck.Name == "BAIDUID" || ck.Name == "BDUSS")
    {
        ck.Domain = "pcs.baidu.com";
    }
    newCookies.Add(ck);
}
crifanLib.setCurCookies(newCookies);
string boundaryValue = "----WebKitFormBoundaryS0JIa4uHF7yHd8xJ";
string boundaryExpression = "boundary=" + boundaryValue;
headerDict = new Dictionary<string, string>();
headerDict.Add("Pragma", "no-cache");
headerDict.Add("Content-Type", "multipart/form-data;" + " " + boundaryExpression);
postDataStr = boundaryValue + "\r\n"
+ "Content-Disposition: form-data; name=\"file\"" + "\r\n"
+ postDataStr + "\r\n"
+ boundaryValue;
//string str = crifanLib.getUrlRespHtml(
// string.Format(@"https://pcs.baidu.com/rest/2.0/pcs/file?method=upload&path=%2Fapps%2F%E6%B5%8B%E8%AF%95%E5%BA%94%E7%94%A8%2F78.jpg&access_token={0}", tokenStr),
// headerDict, postDict);
string respJson = crifanLib.getUrlRespHtml(uploadSingleFileUrl, headerDict:headerDict, postDataStr: postDataStr);
For example, what was encountered earlier in "[Note] Adding support to BlogsToWordPress for exporting NetEase's mood essays": the post data is separated by newlines, so the corresponding postDataStr has to be set directly:
string postDataStr =
"callCount=1" + "\r\n" +
"scriptSessionId=${scriptSessionId}187" + "\r\n" +
"c0-scriptName=BlogBeanNew" + "\r\n" +
"c0-methodName=getBlogs" + "\r\n" +
"c0-id=0" + "\r\n" +
"c0-param0=" + "number:" + userId + "\r\n" +
"c0-param1=" + "number:" + startBlogIdx + "\r\n" +
"c0-param2=" + "number:" + onceGetNum;
//http://api.blog.163.com/ni_chen/dwr/call/plaincall/BlogBeanNew.getBlogs.dwr
string getBlogsDwrMainUrl = blogApi163 + "/" + blogUser + "/" + "dwr/call/plaincall/BlogBeanNew.getBlogs.dwr";
Dictionary<string, string> headerDict = new Dictionary<string, string>();
//Referer http://api.blog.163.com/crossdomain.html?t=20100205
headerDict.Add("Referer", "http://api.blog.163.com/crossdomain.html?t=20100205");
headerDict.Add("Content-Type", "text/plain");
string blogsRespHtml = getUrlRespHtml(getBlogsDwrMainUrl, headerDict:headerDict, postDataStr:postDataStr);