【背景】
写了个爬虫,爬取 fiverr.com 上的 gig 数据,
然后将所抓取数据,导出成excel和csv。
下面把代码分享出来。
供参考。
【FiverComScraper 代码】
1.截图:
2.完整项目代码下载:
FiverrComScraper_2013-03-08_onlyScrapeFiverrCom_beforeWebsiteChange.7z
3.源码:
(1)frmFiverrComScraper.cs
/*
 * [File]
 * frmFiverrComScraper.cs
 *
 * [Function]
 * fiverr.com scraper
 *
 * [Note]
 *
 * [Update]
 * 2013-03-08
 *
 * [Author]
 * Crifan Li
 *
 * [Contact]
 * https://www.crifan.org/contact_me/
 *
 */

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using Sgml;
using System.Xml;
using System.IO;
using System.Web;
using Excel = Microsoft.Office.Interop.Excel;
using Microsoft.Office.Interop.Excel;

/*
 * icons:
 *
 * search/find
 * http://www.easyicon.cn/icondetail/106/
 *
 * stop
 * http://www.easyicon.cn/icondetail/568811/
 *
 * excel
 * http://www.easyicon.cn/icondetail/1087666/
 *
 * csv
 * http://www.easyicon.cn/icondetail/558199/
 *
 * help
 * http://www.easyicon.cn/icondetail/12270/
 */

namespace FiverComScraper
{
    /// <summary>
    /// Main form: searches fiverr.com for a keyword, opens every gig found on the
    /// result pages, shows the extracted fields in a grid and exports the grid to
    /// Excel or CSV.
    /// </summary>
    /// <remarks>
    /// NOTE(review): downloads are awaited with a DoEvents loop, so button handlers
    /// can run re-entrantly while a search is still unwinding (e.g. Pause followed
    /// immediately by Continue). That pre-existing design is left unchanged here.
    /// </remarks>
    public partial class frmFiverrComScraper : Form
    {
        // Site root that the relative gig links of a search result page are appended to.
        private const string FiverrMainUrl = "http://fiverr.com";

        // XPath prefix / namespace URI used for every query on the parsed pages.
        private const string XhtmlPrefix = "w3org";
        private const string XhtmlNamespace = "http://www.w3.org/1999/xhtml";

        public crifanLib crifanLib;
        public DataGridViewButtonColumn gigUrlColumn = null;

        // Index of the "Gig Url" button column: it is appended after the 12 text columns.
        public static int girUrlColumnIdx = 12;

        //need get more gig to scrape or not
        bool needGetMore = true;

        // true while a background download is still running
        bool bWorkNotCompleted = true;

        // HTML of the most recently finished download ("" when it failed)
        private string curRespHtml = "";

        enum search_status
        {
            SEARCH_STATUS_STOPPED,
            SEARCH_STATUS_SEARCHING,
            SEARCH_STATUS_PAUSED
        };

        search_status curSearchStatus = search_status.SEARCH_STATUS_STOPPED;

        /// <summary>
        /// Progress of the running (or paused) search, kept so that it can be resumed.
        /// </summary>
        public struct search_info
        {
            public int pageNum;
            public string searchUrl;
            public string searchRespHtml;
            public XmlDocument xmlDoc;
            public XmlNamespaceManager m;
            // gigs of the page currently being processed; null when no page is in progress
            public XmlNodeList gigDataList;
            // index of the next gig of gigDataList to process
            public int nodeIdx;
        };

        search_info curSearchInfo = new search_info();

        public frmFiverrComScraper()
        {
            // must be hooked before InitializeComponent so embedded dlls can be resolved
            AppDomain.CurrentDomain.AssemblyResolve += new ResolveEventHandler(CurrentDomain_AssemblyResolve);

            InitializeComponent();

            crifanLib = new crifanLib();
            gigUrlColumn = new DataGridViewButtonColumn();
        }

        /// <summary>
        /// Loads an assembly that is embedded in the project resources.
        /// The resource key is the simple assembly name with '.' replaced by '_'.
        /// </summary>
        /// <returns>The loaded assembly, or null when no such resource exists.</returns>
        System.Reflection.Assembly CurrentDomain_AssemblyResolve(object sender, ResolveEventArgs args)
        {
            string dllName = args.Name.Contains(",")
                ? args.Name.Substring(0, args.Name.IndexOf(','))
                : args.Name.Replace(".dll", "");
            dllName = dllName.Replace(".", "_");

            if (dllName.EndsWith("_resources"))
            {
                return null;
            }

            System.Resources.ResourceManager rm = new System.Resources.ResourceManager(
                GetType().Namespace + ".Properties.Resources",
                System.Reflection.Assembly.GetExecutingAssembly());

            // Fix: a missing resource used to reach Assembly.Load(null) and throw;
            // returning null tells the runtime that this handler cannot resolve it.
            byte[] bytes = rm.GetObject(dllName) as byte[];
            if (bytes == null)
            {
                return null;
            }

            return System.Reflection.Assembly.Load(bytes);
        }

        /// <summary>Sets header text and width of one of the plain text columns.</summary>
        private void initTextColumn(int columnIdx, string headerText, int width)
        {
            DataGridViewColumn column = dgvSearchResult.Columns[columnIdx];
            column.HeaderText = headerText;
            column.Width = width;
        }

        /// <summary>Builds the result grid: 12 text columns plus the "Gig Url" button column.</summary>
        private void frmFiverrComScraper_Load(object sender, EventArgs e)
        {
            //DataGridView init
            dgvSearchResult.ColumnCount = 12;

            dgvSearchResult.RowHeadersWidth = 60;
            dgvSearchResult.RowHeadersDefaultCellStyle.Alignment = DataGridViewContentAlignment.MiddleCenter;
            dgvSearchResult.RowHeadersWidthSizeMode = DataGridViewRowHeadersWidthSizeMode.DisableResizing;

            dgvSearchResult.AutoSizeColumnsMode = DataGridViewAutoSizeColumnsMode.None;
            dgvSearchResult.AutoSizeRowsMode = DataGridViewAutoSizeRowsMode.AllCellsExceptHeaders;

            //(1)title: takes whatever width the other columns leave over
            dgvSearchResult.Columns[0].AutoSizeMode = DataGridViewAutoSizeColumnMode.Fill;
            initTextColumn(0, "Title", 100);
            //(2)seller rating ( based on 1-100% format )
            initTextColumn(1, "Seller Rating", 49);
            //(3)estimated delivery ( based on 24 hours - 7days format )
            initTextColumn(2, "Estimated Delivery", 66);
            //(4)gig rating ( based on 1-100% )
            initTextColumn(3, "Gig Rating", 47);
            //(5)orders in queue ( based on 0-9999 format )
            initTextColumn(4, "Orders in Queue", 54);
            //(6)level of the seller ( 1-3 )
            initTextColumn(5, "Seller Level", 47);
            //(7)has video ( yes or no )
            initTextColumn(6, "Has Video", 42);
            //(8)express gigs ( yes or no )
            initTextColumn(7, "Is Express Gig", 55);
            //(9)country flag
            initTextColumn(8, "Country Flag", 106);
            //(10)positive and negative reviews ( based on 1-9999 )
            initTextColumn(9, "Positive Reviews", 57);
            initTextColumn(10, "Negative Reviews", 60);
            //(11)top rated seller ( yes or no )
            initTextColumn(11, "Is Top Rated Seller", 50);

            //(12)gig url: a button column; each cell keeps the url in its Tag
            gigUrlColumn.HeaderText = "Gig Url";
            gigUrlColumn.Text = "Buy Now";
            gigUrlColumn.Width = 106;
            dgvSearchResult.Columns.Add(gigUrlColumn);

            updateUI();
        }

        /// <summary>Enables/disables and labels the buttons according to curSearchStatus.</summary>
        private void updateUI()
        {
            switch (curSearchStatus)
            {
                case search_status.SEARCH_STATUS_STOPPED:
                    btnSearch.Enabled = true;
                    btnSearch.Text = "Search";
                    btnPause.Enabled = false;
                    btnStop.Enabled = false;
                    break;

                case search_status.SEARCH_STATUS_PAUSED:
                    btnSearch.Enabled = true;
                    btnSearch.Text = "Continue Search";
                    btnPause.Enabled = false;
                    btnStop.Enabled = true;
                    break;

                case search_status.SEARCH_STATUS_SEARCHING:
                    btnSearch.Enabled = false;
                    btnSearch.Text = "Searching";
                    btnPause.Enabled = true;
                    btnStop.Enabled = true;
                    break;
            }
        }

        /// <summary>
        /// Parses HTML with SgmlReader into an XmlDocument so it can be queried with XPath.
        /// Element names are folded to lower case.
        /// </summary>
        XmlDocument htmlToXmlDoc(string html)
        {
            // setup SgmlReader
            Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
            sgmlReader.DocType = "HTML";
            sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
            sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
            sgmlReader.InputStream = new StringReader(html);

            // create document
            XmlDocument doc = new XmlDocument();
            doc.PreserveWhitespace = true;
            doc.XmlResolver = null;
            doc.Load(sgmlReader);
            return doc;
        }

        /// <summary>Creates a namespace manager that maps the "w3org" prefix to XHTML.</summary>
        private static XmlNamespaceManager createXhtmlNamespaceManager(XmlDocument doc)
        {
            XmlNamespaceManager manager = new XmlNamespaceManager(doc.NameTable);
            manager.AddNamespace(XhtmlPrefix, XhtmlNamespace);
            return manager;
        }

        /// <summary>Parses an integer, returning 0 when the text is null or not a valid int.</summary>
        private static int parseIntOrZero(string text)
        {
            int value;
            if ((text != null) && Int32.TryParse(text.Trim(), out value))
            {
                return value;
            }
            return 0;
        }

        /// <summary>
        /// Downloads <paramref name="url"/> on a background worker and pumps UI messages
        /// until the download has finished.
        /// </summary>
        /// <returns>The response HTML, or "" when the download failed or was cancelled.</returns>
        private string fetchHtml(string url)
        {
            getUrlRespHtml_bw(url);
            while (bWorkNotCompleted)
            {
                System.Windows.Forms.Application.DoEvents();
                // yield briefly so waiting does not keep one CPU core fully busy
                System.Threading.Thread.Sleep(10);
            }
            return curRespHtml;
        }

        /// <summary>
        /// Downloads one gig page, extracts all fields and appends them to the grid.
        /// A gig whose page cannot be downloaded, or that has no title node, is skipped;
        /// any other missing element leaves its field at the default value.
        /// </summary>
        private void processEachGig(string gigUrl)
        {
            gigInfo singleGigInfo = new gigInfo();

            //(12)gig url
            singleGigInfo.gigUrl = gigUrl;

            string gigHtml = fetchHtml(gigUrl);
            if (String.IsNullOrEmpty(gigHtml))
            {
                // download failed -> nothing to parse
                return;
            }

            XmlDocument xmlDoc = htmlToXmlDoc(gigHtml);
            XmlNamespaceManager m = createXhtmlNamespaceManager(xmlDoc);

            //(1)title
            //http://fiverr.com/gamingaffiliate/seo-critique-your-website
            //<div class="gig-title-g">
            //  <h1 itemprop="name">
            //      I will seo critique your website with search engine optimization strategies and tips for $5
            //  </h1>
            //</div>
            XmlNode titleNode = xmlDoc.SelectSingleNode("//w3org:h1[@itemprop='name']", m);
            if (titleNode == null)
            {
                // not a recognizable gig page
                return;
            }
            // InnerText is surrounded by newlines and tabs
            singleGigInfo.title = titleNode.InnerText.Trim();

            //(2)seller rating ( based on 1-100% format )
            //<div class='user-rate'>rated <span class='colored green'>99%</span></div>
            XmlNode userRateNode = xmlDoc.SelectSingleNode("//w3org:div[@class='user-rate']", m);
            string userRateValue = "";
            if ((userRateNode != null) &&
                crifanLib.extractSingleStr(@"(\d+)%", userRateNode.InnerText, out userRateValue))
            {
                singleGigInfo.sellerRating = parseIntOrZero(userRateValue);
            }

            //(3)estimated delivery ( based on 24 hours - 7days format )
            //<li class="delv-time">
            //  <div>
            //      <span class='big-txt'>2</span> <span class='mid-txt'>days</span>
            //  </div>
            //  <div class="small-txt">EST. DELIVERY</div>
            //</li>
            XmlNode delvTimeNode = xmlDoc.SelectSingleNode("//w3org:li[@class='delv-time']", m);
            if (delvTimeNode != null)
            {
                XmlNode delvTimeBigTxtNode = delvTimeNode.SelectSingleNode(".//w3org:span[@class='big-txt']", m);
                XmlNode delvTimeMidTxtNode = delvTimeNode.SelectSingleNode(".//w3org:span[@class='mid-txt']", m);
                if ((delvTimeBigTxtNode != null) && (delvTimeMidTxtNode != null))
                {
                    singleGigInfo.estimatedDeliveryStr = delvTimeBigTxtNode.InnerText + " " + delvTimeMidTxtNode.InnerText;
                }
            }

            //(4)gig rating ( based on 1-100% )
            //<li class="gig-rating">
            //  <span class="big-txt">100<span class='mid-txt'>%</span></span>
            //  <div class="small-txt max-rate">GIG RATING</div>
            //</li>
            //not rated yet:
            //<li class="gig-rating">
            //  <span class="big-txt not-availale">N/A</span>
            //  <div class="small-txt not-availale">NOT RATED YET</div>
            //</li>
            XmlNode gigRatingNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='gig-rating']", m);
            string gigRatingValue = "";
            if ((gigRatingNode != null) &&
                crifanLib.extractSingleStr(@"(\d+)%", gigRatingNode.InnerText, out gigRatingValue))
            {
                singleGigInfo.gigRating = parseIntOrZero(gigRatingValue);
            }
            else
            {
                singleGigInfo.gigRating = 0;
            }

            //(5)orders in queue ( based on 0-9999 format )
            //<li class="queue ">
            //  <div class="big-txt">4<span class="mid-txt">in queue</span></div>
            //  <div class="small-txt">ORDERS</div>
            //</li>
            XmlNode queueNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='queue ']", m);
            if (queueNode != null)
            {
                XmlNode queueBigTxtNode = queueNode.SelectSingleNode(".//w3org:div[@class='big-txt']", m);
                string queueValue = "";
                if ((queueBigTxtNode != null) &&
                    crifanLib.extractSingleStr(@"(\d+)", queueBigTxtNode.InnerText, out queueValue))
                {
                    singleGigInfo.ordersInQueue = parseIntOrZero(queueValue);
                }
            }
            else
            {
                //an empty queue is marked with a different class:
                //<li class="queue not-availale">
                //  <div class="big-txt">0<span class="mid-txt">in queue</span></div>
                //</li>
                XmlNode queueNoneNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='queue not-availale']", m);
                if (queueNoneNode != null)
                {
                    singleGigInfo.ordersInQueue = 0;
                }
                else
                {
                    //seems some error
                    MessageBox.Show("Error while find orders in queue!");
                }
            }

            //(6)level of the seller ( 1-3 )
            //(11)top rated seller ( yes or no )
            //<li class="badge-container top_rated_seller"> ... </li>
            //<li class="badge-container level_two_seller"> ... </li>
            //a page without any badge-container means level 0
            XmlNode badgeLevelOneNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='badge-container level_one_seller']", m);
            XmlNode badgeLevelTwoNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='badge-container level_two_seller']", m);
            XmlNode badgeTopRatedNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='badge-container top_rated_seller']", m);
            int badgeLevel = 0;
            bool isTopRatedSeller = false;
            if (badgeLevelOneNode != null)
            {
                badgeLevel = 1;
            }
            else if (badgeLevelTwoNode != null)
            {
                badgeLevel = 2;
            }
            else if (badgeTopRatedNode != null)
            {
                badgeLevel = 3;
                isTopRatedSeller = true;
            }
            singleGigInfo.sellerLevel = badgeLevel;
            singleGigInfo.isTopRatedSeller = isTopRatedSeller;

            //(7)has video ( yes or no )
            //<div class="play-trigger">
            //  <a href="http://api.dmcloud.net/embed/..." class="vid-play"></a>
            //</div>
            XmlNode playTriggerNode = xmlDoc.SelectSingleNode(".//w3org:div[@class='play-trigger']", m);
            singleGigInfo.hasVideo = (playTriggerNode != null);

            //(8)express gigs ( yes or no )
            //<div class='express'>EXPRESS DELIVERY</div>
            XmlNode expressNode = xmlDoc.SelectSingleNode(".//w3org:div[@class='express']", m);
            singleGigInfo.isExpressGig = (expressNode != null);

            //(9)country flag
            //<li class="user-det">
            //  <div>
            //      By <a href="/maxsimpson">maxsimpson</a> <div class='user-rate'>rated <span class='colored green'>98%</span></div>
            //      <span class='flag in' title="India"></span>
            //  </div>
            //</li>
            //other samples: title="United States", title="Viet Nam"
            XmlNode userDetNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='user-det']", m);
            string countryTxt = "";
            // the pattern runs against InnerXml, where attributes are double-quoted
            if ((userDetNode != null) &&
                crifanLib.extractSingleStr(@"<span class=""flag \w+"" title=""([a-zA-Z ]+)""", userDetNode.InnerXml, out countryTxt))
            {
                singleGigInfo.coutryFlag = countryTxt;
            }

            //(10)positive and negative reviews ( based on 1-9999 )
            //<li class="thumbs">
            //  <div class="gig-stats-numbers"><span itemprop="ratingValue" content="5.0">684</span></div>
            //  <div class="gig-stats-text">POSITIVE REVIEWS</div>
            //</li>
            //<li class="thumbs">
            //  <div class="gig-stats-numbers"><span itemprop="reviewCount" content="690">6</span></div>
            //  <div class="gig-stats-text">NEGATIVE REVIEWS</div>
            //</li>
            XmlNode positiveNode = xmlDoc.SelectSingleNode(".//w3org:span[@itemprop='ratingValue']", m);
            XmlNode negativeNode = xmlDoc.SelectSingleNode(".//w3org:span[@itemprop='reviewCount']", m);
            if ((positiveNode != null) && (negativeNode != null))
            {
                singleGigInfo.positiveReviews = parseIntOrZero(positiveNode.InnerText);
                singleGigInfo.negativeReviews = parseIntOrZero(negativeNode.InnerText);
            }
            else
            {
                //some gigs have neither POSITIVE REVIEWS nor NEGATIVE REVIEWS
                singleGigInfo.positiveReviews = 0;
                singleGigInfo.negativeReviews = 0;
            }

            storeGigInfo(singleGigInfo);

            //update UI
            System.Windows.Forms.Application.DoEvents();
        }

        /// <summary>All fields scraped from one gig page.</summary>
        public struct gigInfo
        {
            public string title;
            public int sellerRating;
            public string estimatedDeliveryStr;
            public int gigRating;
            public int ordersInQueue;
            public int sellerLevel;
            public bool hasVideo;
            public bool isExpressGig;
            public string coutryFlag;
            public int positiveReviews;
            public int negativeReviews;
            public bool isTopRatedSeller;
            public string gigUrl;
        };

        /// <summary>
        /// Starts downloading <paramref name="url"/> on a new BackgroundWorker.
        /// bWorkNotCompleted stays true until m_bgWorker_RunWorkerCompleted has run.
        /// </summary>
        private void getUrlRespHtml_bw(string url)
        {
            // Create a background thread
            BackgroundWorker m_bgWorker = new BackgroundWorker();
            m_bgWorker.DoWork += new DoWorkEventHandler(m_bgWorker_DoWork);
            m_bgWorker.RunWorkerCompleted += new RunWorkerCompletedEventHandler(m_bgWorker_RunWorkerCompleted);

            //init: also forget the previous response so a failed download
            //can never be mistaken for the page that was fetched before it
            bWorkNotCompleted = true;
            curRespHtml = "";

            // run in another thread
            m_bgWorker.RunWorkerAsync(url);
        }

        private void m_bgWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            string url = (string)e.Argument;
            e.Result = crifanLib.getUrlRespHtml(url);
        }

        void m_bgWorker_ProgressChanged(object sender, ProgressChangedEventArgs e)
        {
            bWorkNotCompleted = true;
        }

        /// <summary>
        /// Publishes the download result in curRespHtml and releases the waiting loop.
        /// </summary>
        private void m_bgWorker_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
        {
            // e.Result must not be read when the worker failed or was cancelled
            // (the property throws in those cases), so check those states first.
            if ((e.Error != null) || e.Cancelled)
            {
                curRespHtml = "";
            }
            else
            {
                curRespHtml = (e.Result != null) ? e.Result.ToString() : "";
            }

            // Fix: the flag used to be cleared only on success, which left the
            // callers' wait loop spinning forever after a failed download.
            bWorkNotCompleted = false;

            BackgroundWorker worker = sender as BackgroundWorker;
            if (worker != null)
            {
                worker.Dispose();
            }
        }

        /// <summary>
        /// Builds the absolute gig url from one search result node.
        /// </summary>
        /// <returns>The url, or null when the node does not contain the expected title link.</returns>
        private string extractGigUrl(XmlNode gigNode)
        {
            //<div class="gig-title approved"> ... <h2><a href="/daica85/give-you-an-advance-seo-techniques-ebook">...</a></h2>
            XmlNode gigTitleNode = gigNode.SelectSingleNode(".//w3org:div[@class='gig-title approved']", curSearchInfo.m);
            if (gigTitleNode == null)
            {
                return null;
            }

            XmlNode h2ANode = gigTitleNode.SelectSingleNode(".//w3org:h2/w3org:a", curSearchInfo.m);
            if ((h2ANode == null) || (h2ANode.Attributes == null) || (h2ANode.Attributes["href"] == null))
            {
                return null;
            }

            // href is site-relative, e.g. /daica85/give-you-an-advance-seo-techniques-ebook
            return FiverrMainUrl + h2ANode.Attributes["href"].Value;
        }

        /// <summary>
        /// Processes the gigs of the current page starting at curSearchInfo.nodeIdx.
        /// </summary>
        /// <returns>
        /// true when every gig of the page has been processed; false when the loop was
        /// interrupted (pause/stop), in which case nodeIdx points at the next gig to do.
        /// </returns>
        private bool processRemainingGigsOnPage()
        {
            while (curSearchInfo.nodeIdx < curSearchInfo.gigDataList.Count)
            {
                if (!needGetMore)
                {
                    return false;
                }

                string singleGigUrl = extractGigUrl(curSearchInfo.gigDataList[curSearchInfo.nodeIdx]);
                if (singleGigUrl != null)
                {
                    processEachGig(singleGigUrl);
                }

                curSearchInfo.nodeIdx++;
            }

            return true;
        }

        /// <summary>Marks the current page as done and moves on to the next page number.</summary>
        private void advanceToNextPage()
        {
            curSearchInfo.gigDataList = null;
            curSearchInfo.nodeIdx = 0;
            curSearchInfo.pageNum++;
        }

        /// <summary>Ends the search and puts the UI back into the stopped state.</summary>
        private void finishSearch()
        {
            needGetMore = false;
            curSearchStatus = search_status.SEARCH_STATUS_STOPPED;
            updateUI();
        }

        /// <summary>
        /// Starts a new search (when stopped) or continues the paused one, then keeps
        /// fetching result pages until a page has no gigs, a download fails, or the
        /// user pauses/stops.
        /// </summary>
        private void btnSearch_Click(object sender, EventArgs e)
        {
            if (curSearchStatus == search_status.SEARCH_STATUS_PAUSED)
            {
                //continue search
                needGetMore = true;
                curSearchStatus = search_status.SEARCH_STATUS_SEARCHING;
                updateUI();

                // Fix: the page number is advanced only when the interrupted page has
                // really been finished; it used to be incremented on pause AND on resume,
                // which silently skipped gigs.
                if (curSearchInfo.gigDataList != null)
                {
                    if (processRemainingGigsOnPage())
                    {
                        advanceToNextPage();
                    }
                }
            }
            else if (curSearchStatus == search_status.SEARCH_STATUS_STOPPED)
            {
                // new search -> clear previously searched result
                clearSearchResult();

                curSearchStatus = search_status.SEARCH_STATUS_SEARCHING;
                updateUI();

                curSearchInfo = new search_info();
                curSearchInfo.pageNum = 1;

                needGetMore = true;
            }
            else
            {
                //unexpected status
                return;
            }

            while (needGetMore)
            {
                //http://fiverr.com/gigs/search?utf8=%E2%9C%93&query=seo&x=15&y=13&page=2
                curSearchInfo.searchUrl = "http://fiverr.com/gigs/search?utf8=%E2%9C%93"
                    + "&query=" + HttpUtility.UrlEncode(txbKeyword.Text)
                    + "&page=" + curSearchInfo.pageNum.ToString();

                curSearchInfo.searchRespHtml = fetchHtml(curSearchInfo.searchUrl);
                if (String.IsNullOrEmpty(curSearchInfo.searchRespHtml))
                {
                    // The page could not be downloaded. Pause (unless the user already
                    // paused/stopped) so that "Continue Search" retries the same page.
                    needGetMore = false;
                    curSearchInfo.gigDataList = null;
                    if (curSearchStatus == search_status.SEARCH_STATUS_SEARCHING)
                    {
                        curSearchStatus = search_status.SEARCH_STATUS_PAUSED;
                        updateUI();
                        MessageBox.Show("Failed to download search result page: " + curSearchInfo.searchUrl);
                    }
                    break;
                }

                curSearchInfo.xmlDoc = htmlToXmlDoc(curSearchInfo.searchRespHtml);
                curSearchInfo.m = createXhtmlNamespaceManager(curSearchInfo.xmlDoc);
                curSearchInfo.gigDataList = curSearchInfo.xmlDoc.SelectNodes("//w3org:div[@data-gig_id]", curSearchInfo.m);
                curSearchInfo.nodeIdx = 0;

                // Fix: SelectNodes returns an empty list rather than null, so the old
                // null test never ended the search; a page without gigs is the end.
                if ((curSearchInfo.gigDataList == null) || (curSearchInfo.gigDataList.Count == 0))
                {
                    curSearchInfo.gigDataList = null;
                    finishSearch();
                    break;
                }

                if (processRemainingGigsOnPage())
                {
                    advanceToNextPage();
                }
            }
        }

        /// <summary>Pauses a running search; progress stays in curSearchInfo.</summary>
        private void btnPause_Click(object sender, EventArgs e)
        {
            if (curSearchStatus == search_status.SEARCH_STATUS_SEARCHING)
            {
                curSearchStatus = search_status.SEARCH_STATUS_PAUSED;
                updateUI();

                needGetMore = false;
            }
        }

        /// <summary>Stops a running or paused search.</summary>
        private void btnStopSearching_Click(object sender, EventArgs e)
        {
            if ((curSearchStatus == search_status.SEARCH_STATUS_SEARCHING) ||
                (curSearchStatus == search_status.SEARCH_STATUS_PAUSED))
            {
                curSearchStatus = search_status.SEARCH_STATUS_STOPPED;
                updateUI();

                needGetMore = false;
            }
        }

        /// <summary>
        /// Appends one gig to the grid, selects and scrolls to the new row and
        /// renumbers the row headers.
        /// </summary>
        void storeGigInfo(gigInfo singleGigInfo)
        {
            // Use the index returned by Add instead of assuming the new row is the last one.
            int newRowIdx = dgvSearchResult.Rows.Add(
                singleGigInfo.title,
                singleGigInfo.sellerRating,
                singleGigInfo.estimatedDeliveryStr,
                singleGigInfo.gigRating,
                singleGigInfo.ordersInQueue,
                singleGigInfo.sellerLevel,
                singleGigInfo.hasVideo ? "yes" : "no",
                singleGigInfo.isExpressGig,
                singleGigInfo.coutryFlag,
                singleGigInfo.positiveReviews,
                singleGigInfo.negativeReviews,
                singleGigInfo.isTopRatedSeller);

            // the button shows "Buy Now"; the url itself travels in the cell's Tag
            DataGridViewCell gigUrlCell = dgvSearchResult.Rows[newRowIdx].Cells[girUrlColumnIdx];
            gigUrlCell.Value = "Buy Now";
            gigUrlCell.Tag = singleGigInfo.gigUrl;

            dgvSearchResult.Rows[newRowIdx].Selected = true;
            dgvSearchResult.FirstDisplayedScrollingRowIndex = newRowIdx;

            //draw the row index (1-based) in every row header
            for (int rowIdx = 0; rowIdx < dgvSearchResult.Rows.Count; rowIdx++)
            {
                dgvSearchResult.Rows[rowIdx].HeaderCell.Value = String.Format("{0}", rowIdx + 1);
            }
        }

        /// <summary>Opens the gig url stored in the clicked "Buy Now" button cell.</summary>
        private void dgvSearchResult_CellContentClick(object sender, DataGridViewCellEventArgs e)
        {
            if ((e.RowIndex < 0) || (e.ColumnIndex != girUrlColumnIdx))
            {
                return;
            }

            DataGridViewCell clickedButtonCell = dgvSearchResult.Rows[e.RowIndex].Cells[e.ColumnIndex];
            if (clickedButtonCell.Tag == null)
            {
                // no url attached to this cell
                return;
            }

            System.Diagnostics.Process.Start(clickedButtonCell.Tag.ToString());
        }

        /// <summary>Releases a COM object; failures are reported in a message box.</summary>
        private void releaseObject(object obj)
        {
            if (obj == null)
            {
                return;
            }

            try
            {
                System.Runtime.InteropServices.Marshal.ReleaseComObject(obj);
            }
            catch (Exception ex)
            {
                MessageBox.Show("Exception Occured while releasing object " + ex.ToString());
            }
            finally
            {
                GC.Collect();
            }
        }

        /// <summary>
        /// Value of a grid cell as it should be exported: the url kept in Tag for the
        /// "Gig Url" column ("" when none), otherwise the cell value.
        /// </summary>
        private static object getCellExportValue(DataGridViewCell cell, int columnIdx)
        {
            if (columnIdx == girUrlColumnIdx)
            {
                return (cell.Tag == null) ? "" : cell.Tag.ToString();
            }
            return cell.Value;
        }

        /// <summary>Exports the grid (header + rows) to fiverrComScrapedResult.xls via Excel.</summary>
        private void btnSaveAll_Click(object sender, EventArgs e)
        {
            string outputFilename = "fiverrComScrapedResult.xls";
            string fullFilename = Path.Combine(getSaveFolder(), outputFilename);

            object misValue = System.Reflection.Missing.Value;

            // Fix: only ONE Excel instance is created; a second one used to be
            // started and never quit or released.
            Excel.Application xlApp = null;
            Excel.Workbook xlWorkBook = null;
            Excel.Worksheet xlWorkSheet = null;
            bool saved = false;

            try
            {
                xlApp = new Excel.ApplicationClass();
                xlWorkBook = xlApp.Workbooks.Add(misValue);
                xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1);

                //save header (Excel cells are 1-based)
                for (int col = 0; col < dgvSearchResult.ColumnCount; col++)
                {
                    xlWorkSheet.Cells[1, col + 1] = dgvSearchResult.Columns[col].HeaderText;
                }

                //save cells, starting below the header row
                int sheetRow = 2;
                foreach (DataGridViewRow gridRow in dgvSearchResult.Rows)
                {
                    if (gridRow.IsNewRow)
                    {
                        // the empty "add new row" placeholder holds no data
                        continue;
                    }

                    for (int col = 0; col < dgvSearchResult.ColumnCount; col++)
                    {
                        xlWorkSheet.Cells[sheetRow, col + 1] = getCellExportValue(gridRow.Cells[col], col);
                    }
                    sheetRow++;
                }

                //formatting: header to bold
                Range headerRow = xlWorkSheet.get_Range("1:1", System.Type.Missing);
                headerRow.Font.Bold = true;

                xlWorkBook.SaveAs(
                    fullFilename,
                    Excel.XlFileFormat.xlWorkbookNormal,
                    misValue,
                    misValue,
                    misValue,
                    misValue,
                    Excel.XlSaveAsAccessMode.xlExclusive,
                    XlSaveConflictResolution.xlLocalSessionChanges,
                    misValue,
                    misValue,
                    misValue,
                    misValue);
                saved = true;
            }
            finally
            {
                // always shut Excel down, also when the export failed half way
                if (xlWorkBook != null)
                {
                    xlWorkBook.Close(saved, misValue, misValue);
                }
                if (xlApp != null)
                {
                    xlApp.Quit();
                }

                releaseObject(xlWorkSheet);
                releaseObject(xlWorkBook);
                releaseObject(xlApp);
            }

            openFolderAndSelectFile(fullFilename);
        }

        /// <summary>Opens Windows Explorer with the given file selected.</summary>
        private void openFolderAndSelectFile(string fullFilename)
        {
            // quoted so that folders containing spaces or commas are handled
            System.Diagnostics.Process.Start("Explorer.exe", "/select,\"" + fullFilename + "\"");
        }

        /// <summary>
        /// Asks the user for the output folder.
        /// </summary>
        /// <returns>The chosen folder, or the current directory when the dialog is cancelled.</returns>
        private string getSaveFolder()
        {
            string saveFolderPath = System.Environment.CurrentDirectory;
            if (fbdSaveFolder.ShowDialog() == System.Windows.Forms.DialogResult.OK)
            {
                saveFolderPath = fbdSaveFolder.SelectedPath;
            }
            return saveFolderPath;
        }

        /// <summary>
        /// Formats one value as a CSV field: it is wrapped in double quotes (with inner
        /// quotes doubled) when it contains the delimiter, a quote or a line break.
        /// </summary>
        private static string escapeCsvField(object value, string delimiter)
        {
            string text = (value == null) ? "" : value.ToString();

            bool needsQuoting =
                (text.IndexOf(delimiter, StringComparison.Ordinal) >= 0) ||
                (text.IndexOf('"') >= 0) ||
                (text.IndexOf('\r') >= 0) ||
                (text.IndexOf('\n') >= 0);

            if (!needsQuoting)
            {
                return text;
            }
            return "\"" + text.Replace("\"", "\"\"") + "\"";
        }

        /// <summary>Exports the grid (header + rows) to fiverrComScrapedResult.csv as UTF-8.</summary>
        private void btnExportToCsv_Click(object sender, EventArgs e)
        {
            //settings
            string delimiter = ",";
            string outputFilename = "fiverrComScrapedResult.csv";
            string fullFilename = Path.Combine(getSaveFolder(), outputFilename);

            int columnCount = dgvSearchResult.Columns.Count;
            string[] fields = new string[columnCount];

            // "using" closes the file even when writing fails
            using (StreamWriter csvStreamWriter = new StreamWriter(fullFilename, false, System.Text.Encoding.UTF8))
            {
                //output header data
                for (int col = 0; col < columnCount; col++)
                {
                    fields[col] = escapeCsvField(dgvSearchResult.Columns[col].HeaderText, delimiter);
                }
                // joined, so no stray trailing delimiter (= extra empty column) is written
                csvStreamWriter.WriteLine(String.Join(delimiter, fields));

                //output rows data
                foreach (DataGridViewRow gridRow in dgvSearchResult.Rows)
                {
                    if (gridRow.IsNewRow)
                    {
                        // the empty "add new row" placeholder holds no data
                        continue;
                    }

                    for (int col = 0; col < columnCount; col++)
                    {
                        // gig titles are free text and may contain the delimiter or quotes
                        fields[col] = escapeCsvField(getCellExportValue(gridRow.Cells[col], col), delimiter);
                    }
                    csvStreamWriter.WriteLine(String.Join(delimiter, fields));
                }
            }

            //after save file
            openFolderAndSelectFile(fullFilename);
        }

        /// <summary>Removes all rows from the result grid.</summary>
        private void clearSearchResult()
        {
            dgvSearchResult.Rows.Clear();
        }

        private void btnClearAll_Click(object sender, EventArgs e)
        {
            clearSearchResult();
        }

        /// <summary>Opens the online help page in the default browser.</summary>
        private void btnHelp_Click(object sender, EventArgs e)
        {
            string helpUrl = "http://giggladiator.com/help";
            System.Diagnostics.Process.Start(helpUrl);
        }
    }
}
(2)
转载请注明:在路上 » 【代码分享】C#代码:FiverComScraper – 只抓取fiverr.com,网站改版之前