【背景】
之前写的代码,用于抓取 fiverr.com 中帖子
(比如:
http://fiverr.com/bizgrowthcoach/provide-a-startup-checklist-and-project-plan
)的评论。
注:
此代码是在该网站改版之前写的;
且是没有完成的;
只是贴出来,供参考而已->其中有些关于SgmlReader等函数的使用,可供参考;
【ScrapeFiverrComments代码分享】
1.截图:
2.项目代码下载:
ScrapeFiverrComments_2013-02-28_uncompleted.7z
3.代码分享:
(1)frmScrapeFiverrComments.cs
/*
* [File]
* frmScrapeFiverrComments.cs
*
* [Function]
* fiverr.com comments scraper
*
* [Note]
*
* [Update]
* 2013-02-28
*
* [Author]
* Crifan Li
*
* [Contact]
* https://www.crifan.org/contact_me/
*
*/
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Web;
using System.Xml;
using Sgml;
using System.IO;
using Excel = Microsoft.Office.Interop.Excel;
using Microsoft.Office.Interop.Excel;
/*
* icons:
*
* search/find
* http://www.easyicon.cn/icondetail/106/
*
* stop
* http://www.easyicon.cn/icondetail/568811/
*
* crawler
* http://www.easyicon.cn/icondetail/13685/
*
* login
* http://www.easyicon.cn/icondetail/500811/
*
* send mail
* http://www.easyicon.cn/icondetail/538560/
*/
namespace ScrapeFiverrComments
{
public partial class frmScrapeFiverrComments : Form
{
// Helper library instance; created in the constructor and used by this form
// to call getUrlRespHtml() and extractSingleStr().
public crifanLib crifanLib;
// Number of rating entries expected on a full page; a page that yields fewer
// entries is treated as the last page when scraping.
static int constPageGigNumber = 40;
/// <summary>
/// Creates the form: registers the assembly-resolve hook, builds the
/// designer-generated controls and instantiates the crifanLib helper.
/// </summary>
public frmScrapeFiverrComments()
{
    // Registered before InitializeComponent(); presumably so that assemblies
    // stored as embedded resources can already be resolved while the
    // controls are being created.
    AppDomain.CurrentDomain.AssemblyResolve += CurrentDomain_AssemblyResolve;

    InitializeComponent();

    this.crifanLib = new crifanLib();
}
/// <summary>
/// AssemblyResolve handler that tries to load a missing assembly from this
/// program's embedded resources (Properties.Resources).
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="args">Carries the name of the assembly that failed to load.</param>
/// <returns>
/// The assembly loaded from the matching resource, or null when no such
/// resource exists so that the runtime can continue its normal failure handling.
/// </returns>
System.Reflection.Assembly CurrentDomain_AssemblyResolve(object sender, ResolveEventArgs args)
{
    // "Some.Lib, Version=..., Culture=..." -> "Some.Lib"; a bare "Some.Lib.dll" -> "Some.Lib"
    string dllName = args.Name.Contains(",") ? args.Name.Substring(0, args.Name.IndexOf(',')) : args.Name.Replace(".dll", "");

    // Resource identifiers use '_' where the assembly name has '.'.
    dllName = dllName.Replace(".", "_");

    // Satellite resource assemblies ("*.resources") are not embedded here.
    if (dllName.EndsWith("_resources", StringComparison.Ordinal))
    {
        return null;
    }

    System.Resources.ResourceManager rm = new System.Resources.ResourceManager(GetType().Namespace + ".Properties.Resources", System.Reflection.Assembly.GetExecutingAssembly());

    byte[] bytes;
    try
    {
        // "as" instead of a cast: a resource of another type must not raise
        // InvalidCastException from inside the resolve handler.
        bytes = rm.GetObject(dllName) as byte[];
    }
    catch (System.Resources.MissingManifestResourceException)
    {
        // No usable resource set at all: this handler cannot help.
        return null;
    }

    if (bytes == null)
    {
        // Not one of the embedded DLLs. Returning null (instead of letting
        // Assembly.Load(null) throw ArgumentNullException) reports "not resolved".
        return null;
    }

    return System.Reflection.Assembly.Load(bytes);
}
/// <summary>
/// Configures the comment-author grid: two columns ("Username" and
/// "Profile Url"), a fixed-width row header and fill-mode column sizing.
/// </summary>
private void initDataGridView()
{
    DataGridView grid = dgvCmtAuthorList;

    grid.ColumnCount = 2;

    // Row header: fixed 80px wide, centered content, not resizable by the user.
    grid.RowHeadersWidth = 80;
    grid.RowHeadersDefaultCellStyle.Alignment = DataGridViewContentAlignment.MiddleCenter;
    grid.RowHeadersWidthSizeMode = DataGridViewRowHeadersWidthSizeMode.DisableResizing;

    grid.AutoSizeColumnsMode = DataGridViewAutoSizeColumnsMode.Fill;

    // Column 0: username
    DataGridViewColumn usernameColumn = grid.Columns[0];
    usernameColumn.HeaderText = "Username";
    usernameColumn.Width = 160;

    // Column 1: profile url, sized to what is left of the group box width
    // after the row header, the username column and a 20px margin.
    DataGridViewColumn profileUrlColumn = grid.Columns[1];
    profileUrlColumn.HeaderText = "Profile Url";
    profileUrlColumn.Width = grbCmtAuthorList.Width - grid.RowHeadersWidth - grid.Columns[0].Width - 20;
}
/// <summary>
/// Form Load handler: prepares the author grid and disables the login group.
/// </summary>
private void frmScrapeFiverrComments_Load(object sender, EventArgs e)
{
    this.initDataGridView();

    this.grbLogin.Enabled = false;

    // The message text box and the send button are intentionally left in
    // their designer state here (disabling them was tried and switched off).
}
/// <summary>
/// Converts an HTML string into an XmlDocument by reading it through
/// SgmlReader, so that the result can be queried with XPath.
/// </summary>
/// <param name="html">HTML source text.</param>
/// <returns>The loaded document; whitespace is preserved and tag names are folded to lower case.</returns>
XmlDocument htmlToXmlDoc(string html)
{
    // SgmlReader reads the HTML text and exposes it as an XML reader.
    Sgml.SgmlReader htmlReader = new Sgml.SgmlReader
    {
        DocType = "HTML",
        WhitespaceHandling = WhitespaceHandling.All,
        CaseFolding = Sgml.CaseFolding.ToLower,
        InputStream = new StringReader(html)
    };

    // No resolver: nothing external is fetched while loading.
    XmlDocument xmlDocument = new XmlDocument
    {
        PreserveWhitespace = true,
        XmlResolver = null
    };
    xmlDocument.Load(htmlReader);

    return xmlDocument;
}
/// <summary>
/// Author of one gig comment, as shown in the author grid.
/// </summary>
public struct gigCmtAuthorInfo
{
    /// <summary>Display name of the comment author.</summary>
    public string username;

    /// <summary>Absolute URL of the author's profile page.</summary>
    public string profileUrl;
}
/// <summary>
/// Scrapes the authors of all comments (ratings) of the gig whose URL is
/// entered in txbGigUrl and adds every distinct author to the grid through
/// storeCommentAuthorInfo().
/// </summary>
/// <remarks>
/// The gig page is downloaded first; while a page yields a full set of
/// constPageGigNumber ratings, the next page is requested from
/// http://fiverr.com/gigs/&lt;gig-title&gt;?offset=N. Paging stops on a short
/// page, on an empty or repeated response, or when no next-page URL can be
/// built, so the loop always terminates.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnScrape_Click(object sender, EventArgs e)
{
    //http://fiverr.com/bizgrowthcoach/provide-a-startup-checklist-and-project-plan
    string curGigUrl = txbGigUrl.Text.Trim();
    if (curGigUrl.Length == 0)
    {
        // Nothing to scrape.
        return;
    }

    int curPageNumber = 0;
    string prevPageHtml = null;

    while (true)
    {
        // First pass: the gig page itself. Later passes: the "?offset=N" URL
        // computed at the end of the previous pass.
        string gigUrlRespHtml = crifanLib.getUrlRespHtml(curGigUrl);

        // An empty body, or exactly the same body as the previous page
        // (offset ignored by the server), means there is nothing more to read.
        if (string.IsNullOrEmpty(gigUrlRespHtml) || gigUrlRespHtml == prevPageHtml)
        {
            break;
        }
        prevPageHtml = gigUrlRespHtml;

        XmlDocument xmlDoc = htmlToXmlDoc(gigUrlRespHtml);
        XmlNamespaceManager m = new XmlNamespaceManager(xmlDoc.NameTable);
        m.AddNamespace("w3org", "http://www.w3.org/1999/xhtml");

        // Markup of one rating entry:
        //<li class="rating-block ">
        //    <div class="userimage">
        //        <img src="http://dfkno3dtzeq4c.cloudfront.net/assets/02-mini-2bb551afad6a7740ad73314482189dd7.gif" width="24px" height="24px" class="true" alt="azza1200" />
        //    </div>
        //    <div class= " rating-text">
        //        <div>
        //            <div class="rater-username">
        //                <a href="/azza1200" rel="nofollow">azza1200</a>
        //                <span class="time-ago titled" title="1361846351"></span>
        //            </div>
        //            <div class="comment-block">
        //                <div class="rating-icon">
        //                    <img alt="thumb down - negative" src="http://dfkno3dtzeq4c.cloudfront.net/assets/thumb_down-9ff2828220cbb43e26ad5b4fa0b0fe88.png" />
        //                </div>
        //                <div class="rating-comment">
        //                    Terrible value. ...
        //                </div>
        //            </div>
        //        </div>
        //    </div>
        //    <div class="clear"></div>
        //</li>
        XmlNodeList ratingBlockList = xmlDoc.SelectNodes("//w3org:li[@class='rating-block ']", m);
        if (ratingBlockList == null)
        {
            break;
        }

        foreach (XmlNode ratingBlockNode in ratingBlockList)
        {
            //<div class="rater-username">
            //    <a href="/azza1200" rel="nofollow">azza1200</a>
            //    <span class="time-ago titled" title="1361846351"></span>
            //</div>
            XmlNode rateUsernameNode = ratingBlockNode.SelectSingleNode(".//w3org:div[@class='rater-username']", m);
            if (rateUsernameNode == null)
            {
                continue;
            }

            XmlNode aNode = rateUsernameNode.SelectSingleNode(".//w3org:a[@rel|href]", m);
            if (aNode == null)
            {
                continue;
            }

            // The XPath above also matches an <a> that only has "rel", so the
            // href attribute can be absent; skip such entries instead of
            // failing with a NullReferenceException.
            XmlAttribute hrefAttr = (aNode.Attributes != null) ? aNode.Attributes["href"] : null;
            if (hrefAttr == null)
            {
                continue;
            }

            gigCmtAuthorInfo cmtAuthorInfo = new gigCmtAuthorInfo();
            cmtAuthorInfo.username = aNode.InnerText;
            cmtAuthorInfo.profileUrl = "http://fiverr.com" + hrefAttr.Value; //http://fiverr.com/azza1200
            storeCommentAuthorInfo(cmtAuthorInfo);

            //update UI
            System.Windows.Forms.Application.DoEvents();
            if (IsDisposed)
            {
                // The form was closed while events were being pumped.
                return;
            }
        }

        // A page with fewer ratings than a full page is the last one.
        if (ratingBlockList.Count < constPageGigNumber)
        {
            break;
        }

        // Build the URL of the next page.
        // (The AJAX endpoint http://fiverr.com/gigs/<gig_id>/load_ratings?offset=40&show_work_sample=false
        //  also exists, but this scraper pages through the gig-title URL instead.)
        curPageNumber++;
        int offsetNumber = constPageGigNumber * curPageNumber;

        string titlePart = "";
        if (!crifanLib.extractSingleStr(@"http://fiverr\.com/\w+/([\w-]+)", curGigUrl, out titlePart))
        {
            // Without the gig title no next-page URL can be built; fetching
            // the same URL again would never end, so stop here.
            break;
        }

        //http://fiverr.com/gigs/provide-a-startup-checklist-and-project-plan?offset=40
        curGigUrl = "http://fiverr.com/gigs/" + titlePart + "?offset=" + offsetNumber.ToString();
    }
}
/// <summary>
/// Tells whether the given username is not yet listed in the author grid.
/// </summary>
/// <param name="username">Username to look for in the first grid column.</param>
/// <returns>true when no row holds this username; false when it is already listed.</returns>
private bool userNotExist(string username)
{
    foreach (DataGridViewRow row in dgvCmtAuthorList.Rows)
    {
        object cellValue = row.Cells[0].Value;
        if (cellValue == null)
        {
            // Empty cell (e.g. the grid's blank "new row" placeholder):
            // nothing to compare, and calling ToString() on it would throw.
            continue;
        }

        if (cellValue.ToString().Equals(username))
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Appends a comment author to the grid unless the username is already
/// listed, then selects and scrolls to the last row and renumbers all
/// row headers starting from 1.
/// </summary>
/// <param name="cmtAuthorInfo">Username and profile URL of the author.</param>
void storeCommentAuthorInfo(gigCmtAuthorInfo cmtAuthorInfo)
{
    // Already listed: nothing to do.
    if (!userNotExist(cmtAuthorInfo.username))
    {
        return;
    }

    DataGridViewRowCollection rows = dgvCmtAuthorList.Rows;
    rows.Add(cmtAuthorInfo.username, cmtAuthorInfo.profileUrl);

    // Highlight the last row and bring it into view.
    int lastRowIdx = rows.Count - 1;
    rows[lastRowIdx].Selected = true;
    dgvCmtAuthorList.FirstDisplayedScrollingRowIndex = lastRowIdx;

    // Row headers show the 1-based row number.
    int rowNumber = 1;
    foreach (DataGridViewRow row in rows)
    {
        row.HeaderCell.Value = String.Format("{0}", rowNumber);
        rowNumber++;
    }
}
/// <summary>
/// Exports the author grid (header row plus all cells) to
/// "ScrapedGigCommentsAuthorList.xls" in the current directory through Excel
/// automation, then opens Explorer with the saved file selected.
/// </summary>
/// <remarks>
/// Exactly one Excel instance is started, and it is closed and released in a
/// finally block so that no EXCEL.EXE process is left behind when the export
/// fails part-way.
/// </remarks>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
private void btnSaveAll_Click(object sender, EventArgs e)
{
    object misValue = System.Reflection.Missing.Value;

    string currentPath = System.Environment.CurrentDirectory;
    string outputFilename = "ScrapedGigCommentsAuthorList.xls";
    string fullFilename = Path.Combine(currentPath, outputFilename);

    Excel.Application xlApp = null;
    Excel.Workbook xlWorkBook = null;
    Excel.Worksheet xlWorkSheet = null;
    Range headerRow = null;
    Range allColumn = null;
    bool saved = false;

    try
    {
        xlApp = new Excel.Application();
        xlWorkBook = xlApp.Workbooks.Add(misValue);
        xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1);

        //save header (Excel cells are 1-based)
        for (int col = 0; col < dgvCmtAuthorList.ColumnCount; col++)
        {
            xlWorkSheet.Cells[1, col + 1] = dgvCmtAuthorList.Columns[col].HeaderText;
        }

        //save cells, starting below the header row
        for (int row = 0; row < dgvCmtAuthorList.RowCount; row++)
        {
            for (int col = 0; col < dgvCmtAuthorList.ColumnCount; col++)
            {
                DataGridViewCell cell = dgvCmtAuthorList[col, row];
                xlWorkSheet.Cells[row + 2, col + 1] = cell.Value;
            }
        }

        //formatting
        //header to bold
        headerRow = xlWorkSheet.get_Range("1:1", System.Type.Missing);
        headerRow.Font.Bold = true;

        //auto adjust column width (according to content)
        allColumn = xlWorkSheet.Columns;
        allColumn.AutoFit();

        xlWorkBook.SaveAs(fullFilename, Excel.XlFileFormat.xlWorkbookNormal, misValue, misValue, misValue, misValue, Excel.XlSaveAsAccessMode.xlExclusive, XlSaveConflictResolution.xlLocalSessionChanges, misValue, misValue, misValue, misValue);
        saved = true;
    }
    finally
    {
        if (xlWorkBook != null)
        {
            // Save on close only when SaveAs succeeded; after a failure the
            // workbook is discarded instead of prompting or failing again.
            xlWorkBook.Close(saved, misValue, misValue);
        }
        if (xlApp != null)
        {
            xlApp.Quit();
        }

        // Release every COM wrapper obtained above, innermost first.
        if (allColumn != null)
        {
            releaseObject(allColumn);
        }
        if (headerRow != null)
        {
            releaseObject(headerRow);
        }
        if (xlWorkSheet != null)
        {
            releaseObject(xlWorkSheet);
        }
        if (xlWorkBook != null)
        {
            releaseObject(xlWorkBook);
        }
        if (xlApp != null)
        {
            releaseObject(xlApp);
        }
    }

    // Quote the path so that directories containing spaces or commas still work.
    System.Diagnostics.Process.Start("Explorer.exe", "/select,\"" + fullFilename + "\"");
}
/// <summary>
/// Releases a COM object reference and then forces a garbage collection.
/// </summary>
/// <remarks>
/// The caller's own variable is not changed by this method; callers that
/// keep the reference around should clear it themselves.
/// </remarks>
/// <param name="obj">COM object to release; null is ignored.</param>
private void releaseObject(object obj)
{
    if (obj == null)
    {
        // Nothing to release; ReleaseComObject(null) would only raise an
        // ArgumentNullException and show a misleading error dialog.
        return;
    }

    try
    {
        System.Runtime.InteropServices.Marshal.ReleaseComObject(obj);
    }
    catch (Exception ex)
    {
        MessageBox.Show("Exception Occured while releasing object " + ex.ToString());
    }
    finally
    {
        GC.Collect();
    }
}
/// <summary>
/// Removes every row from the comment-author grid.
/// </summary>
private void btnClearAll_Click(object sender, EventArgs e)
{
    DataGridViewRowCollection authorRows = this.dgvCmtAuthorList.Rows;
    authorRows.Clear();
}
/// <summary>
/// Attempts a login with the entered credentials and enables the message
/// text box and the send button only when the login succeeded.
/// </summary>
private void btnLogin_Click(object sender, EventArgs e)
{
    bool loggedIn = loginFiverrCom(txbUsername.Text, txbPassword.Text);

    // Message controls are usable exactly when the login succeeded.
    txbMessageToSend.Enabled = loggedIn;
    btnSendMessage.Enabled = loggedIn;
}
/// <summary>
/// Placeholder for logging in to fiverr.com; no login is performed yet.
/// </summary>
/// <param name="username">Account name (currently unused).</param>
/// <param name="password">Account password (currently unused).</param>
/// <returns>Always false, because the login is not implemented.</returns>
private bool loginFiverrCom(string username, string password)
{
    return false;
}
}
}
【总结】
转载请注明:在路上 » 【代码分享】C#代码:ScrapeFiverrComments – 抓取fiverr.com中帖子的评论