最新消息:20210917 已从crifan.com换到crifan.org

【代码分享】C#代码:FiverComScraper – 只抓取fiverr.com,网站改版之前

CodeShare crifan 2309浏览 0评论

【背景】

写了个爬虫,爬取

http://fiverr.com

然后将所抓取数据,导出成excel和csv。

下面把代码分享出来。

供参考。

 

【FiverComScraper 代码】

1.截图:

fiverr com old code only scrape fiverr com

2.完整项目代码下载:

FiverrComScraper_2013-03-08_onlyScrapeFiverrCom_beforeWebsiteChange.7z

 

3.源码:

(1)frmFiverrComScraper.cs

/*
 * [File]
 * frmFiverrComScraper.cs
 * 
 * [Function]
 * fiverr.com scraper
 * 
 * [Note]
 * 
 * [Update]
 * 2013-03-08
 * 
 * [Author]
 * Crifan Li
 * 
 * [Contact]
 * https://www.crifan.org/contact_me/
 * 
 */

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;

using Sgml;
using System.Xml;
using System.IO;
using System.Web;
using Excel = Microsoft.Office.Interop.Excel;
using Microsoft.Office.Interop.Excel;

/*
 * icons:
 * 
 * search/find
 * http://www.easyicon.cn/icondetail/106/
 * 
 * stop
 * http://www.easyicon.cn/icondetail/568811/
 * 
 * excel
 * http://www.easyicon.cn/icondetail/1087666/
 * 
 * csv
 * http://www.easyicon.cn/icondetail/558199/
 * 
 * help
 * http://www.easyicon.cn/icondetail/12270/
 */

namespace FiverComScraper
{
    public partial class frmFiverrComScraper : Form
    {
        // Helper library instance (created in the constructor); used in this file for
        // HTTP fetching (getUrlRespHtml) and regex extraction (extractSingleStr).
        public crifanLib crifanLib;
        // Button column for the gig URL; created in the constructor and appended to
        // dgvSearchResult after the 12 text columns in frmFiverrComScraper_Load.
        public DataGridViewButtonColumn gigUrlColumn = null;
        // NOTE(review): presumably the index of gigUrlColumn inside dgvSearchResult
        // (12 text columns precede it) -- not referenced in the visible code, confirm.
        public static int girUrlColumnIdx = 12;

        // Whether scraping should go on: set to true when a search starts or resumes,
        // cleared by btnPause_Click, and checked by the loops in btnSearch_Click.
        bool needGetMore = true;

        // True while a background download is pending: set by getUrlRespHtml_bw and
        // cleared by m_bgWorker_RunWorkerCompleted; callers poll it while pumping DoEvents.
        bool bWorkNotCompleted = true;

        // Response HTML stored by m_bgWorker_RunWorkerCompleted for the latest download.
        private string curRespHtml = "";

        // State of the search workflow. updateUI() derives the enabled state and the
        // caption of the Search / Pause / Stop buttons from the current value.
        enum search_status
        {
            // No search is running; the Search button starts a new one.
            SEARCH_STATUS_STOPPED,
            // A search is in progress; Pause and Stop are available.
            SEARCH_STATUS_SEARCHING,
            // The search was paused; the Search button continues it.
            SEARCH_STATUS_PAUSED
        };
        // Current workflow state; the form starts in the stopped state.
        search_status curSearchStatus = search_status.SEARCH_STATUS_STOPPED;

        // Progress of the running search, kept in a field so that a paused search
        // can be continued from where it stopped (see btnSearch_Click).
        public struct search_info
        {
            // Page number sent as the "page" query parameter; a new search starts at 1.
            public int pageNum;
            // Full URL of the search result page that was requested last.
            public string searchUrl;
            // Raw HTML returned for searchUrl.
            public string searchRespHtml;
            // searchRespHtml converted to an XML document by htmlToXmlDoc.
            public XmlDocument xmlDoc;
            // Namespace manager for xmlDoc; maps prefix "w3org" to the XHTML namespace.
            public XmlNamespaceManager m;
            // Nodes of xmlDoc matching //w3org:div[@data-gig_id], one per gig on the page.
            public XmlNodeList gigDataList;
            // Index into gigDataList used as the loop counter while processing gigs.
            public int nodeIdx;

        };
        // Progress of the current (or most recently run) search.
        search_info curSearchInfo = new search_info();
        
        /// <summary>
        /// Builds the form: registers the embedded-assembly resolver, creates the
        /// designer controls and instantiates the helper library and the button column.
        /// </summary>
        public frmFiverrComScraper()
        {
            // Registered before InitializeComponent(), presumably so that assemblies
            // embedded as resources can already be resolved while the controls are built.
            AppDomain.CurrentDomain.AssemblyResolve += this.CurrentDomain_AssemblyResolve;

            this.InitializeComponent();

            this.crifanLib = new crifanLib();
            this.gigUrlColumn = new DataGridViewButtonColumn();
        }

        /// <summary>
        /// AssemblyResolve handler that loads an assembly stored as a byte[] resource
        /// in this project's Properties.Resources (used for embedded dlls).
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="args">Carries the name of the assembly that failed to resolve.</param>
        /// <returns>
        /// The loaded assembly, or null when the request is for a satellite
        /// "*.resources" assembly or no matching byte[] resource exists.
        /// </returns>
        System.Reflection.Assembly CurrentDomain_AssemblyResolve(object sender, ResolveEventArgs args)
        {
            // "Name, Version=..., Culture=..." -> "Name"; a bare "Name.dll" -> "Name"
            string dllName = args.Name.Contains(",") ? args.Name.Substring(0, args.Name.IndexOf(',')) : args.Name.Replace(".dll", "");

            // resource identifiers use '_' where the assembly name has '.'
            dllName = dllName.Replace(".", "_");

            if (dllName.EndsWith("_resources"))
            {
                return null;
            }

            System.Resources.ResourceManager rm = new System.Resources.ResourceManager(GetType().Namespace + ".Properties.Resources", System.Reflection.Assembly.GetExecutingAssembly());

            // GetObject yields null for an unknown name, and the resource may be of another
            // type; in both cases report "not resolved" instead of throwing from the handler.
            byte[] bytes = rm.GetObject(dllName) as byte[];
            if (bytes == null)
            {
                return null;
            }

            return System.Reflection.Assembly.Load(bytes);
        }

        /// <summary>
        /// Form Load handler: lays out the result grid (12 text columns plus the
        /// "Gig Url" button column) and syncs the buttons with the current status.
        /// </summary>
        private void frmFiverrComScraper_Load(object sender, EventArgs e)
        {
            // Header text and width of the text columns, in display order.
            string[] headerTexts = new string[]
            {
                "Title",                // (1) title
                "Seller Rating",        // (2) seller rating, 1-100%
                "Estimated Delivery",   // (3) estimated delivery, 24 hours - 7 days
                "Gig Rating",           // (4) gig rating, 1-100%
                "Orders in Queue",      // (5) orders in queue, 0-9999
                "Seller Level",         // (6) level of the seller, 1-3
                "Has Video",            // (7) has video, yes or no
                "Is Express Gig",       // (8) express gig, yes or no
                "Country Flag",         // (9) country of the seller
                "Positive Reviews",     // (10) positive reviews, 1-9999
                "Negative Reviews",     // (10) negative reviews, 1-9999
                "Is Top Rated Seller"   // (11) top rated seller, yes or no
            };
            int[] columnWidths = new int[] { 100, 49, 66, 47, 54, 47, 42, 55, 106, 57, 60, 50 };

            //DataGridView init
            dgvSearchResult.ColumnCount = headerTexts.Length;

            dgvSearchResult.RowHeadersWidth = 60;
            dgvSearchResult.RowHeadersDefaultCellStyle.Alignment = DataGridViewContentAlignment.MiddleCenter;
            dgvSearchResult.RowHeadersWidthSizeMode = DataGridViewRowHeadersWidthSizeMode.DisableResizing;

            dgvSearchResult.AutoSizeColumnsMode = DataGridViewAutoSizeColumnsMode.None;
            dgvSearchResult.AutoSizeRowsMode = DataGridViewAutoSizeRowsMode.AllCellsExceptHeaders;

            // only the title column stretches to take the remaining width
            dgvSearchResult.Columns[0].AutoSizeMode = DataGridViewAutoSizeColumnMode.Fill;

            for (int colIdx = 0; colIdx < headerTexts.Length; colIdx++)
            {
                DataGridViewColumn column = dgvSearchResult.Columns[colIdx];
                column.HeaderText = headerTexts[colIdx];
                column.Width = columnWidths[colIdx];
            }

            // (12) gig url: shown as a button column appended after the text columns
            gigUrlColumn.HeaderText = "Gig Url";
            gigUrlColumn.Text = "Buy Now";
            gigUrlColumn.Width = 106;
            dgvSearchResult.Columns.Add(gigUrlColumn);

            updateUI();
        }

        /// <summary>
        /// Brings the Search / Pause / Stop buttons in line with curSearchStatus.
        /// </summary>
        private void updateUI()
        {
            switch (curSearchStatus)
            {
                case search_status.SEARCH_STATUS_STOPPED:
                    // idle: only a new search can be started
                    btnSearch.Enabled = true;
                    btnSearch.Text = "Search";
                    btnPause.Enabled = false;
                    btnStop.Enabled = false;
                    break;

                case search_status.SEARCH_STATUS_PAUSED:
                    // paused: continue or give up
                    btnSearch.Enabled = true;
                    btnSearch.Text = "Continue Search";
                    btnPause.Enabled = false;
                    btnStop.Enabled = true;
                    break;

                case search_status.SEARCH_STATUS_SEARCHING:
                    // running: pause or stop
                    btnSearch.Enabled = false;
                    btnSearch.Text = "Searching";
                    btnPause.Enabled = true;
                    btnStop.Enabled = true;
                    break;

                default:
                    // any other value leaves the buttons untouched
                    break;
            }
        }

        /// <summary>
        /// Parses an HTML string with SgmlReader and loads the result into an XmlDocument.
        /// </summary>
        /// <param name="html">HTML source text.</param>
        /// <returns>The document built from <paramref name="html"/>; whitespace is preserved.</returns>
        XmlDocument htmlToXmlDoc(string html)
        {
            // reader that turns the HTML into XML nodes, with names folded to lower case
            SgmlReader htmlReader = new SgmlReader();
            htmlReader.DocType = "HTML";
            htmlReader.WhitespaceHandling = WhitespaceHandling.All;
            htmlReader.CaseFolding = CaseFolding.ToLower;
            htmlReader.InputStream = new StringReader(html);

            // target document; no resolver, so external resources are not fetched
            XmlDocument parsedDoc = new XmlDocument();
            parsedDoc.PreserveWhitespace = true;
            parsedDoc.XmlResolver = null;
            parsedDoc.Load(htmlReader);

            return parsedDoc;
        }

        /// <summary>
        /// Downloads one gig page, extracts the gig's fields from the HTML and passes
        /// the resulting gigInfo to storeGigInfo.
        /// </summary>
        /// <param name="gigUrl">Absolute URL of the gig page.</param>
        /// <remarks>
        /// The download runs on a BackgroundWorker while this method pumps the message
        /// loop, so button clicks are handled while it waits. A page that could not be
        /// downloaded, or that has no gig title, is skipped without storing anything;
        /// every other field that cannot be located keeps its default value.
        /// </remarks>
        private void processEachGig(string gigUrl)
        {
            gigInfo singleGigInfo = new gigInfo();

            //(12)gig url
            singleGigInfo.gigUrl = gigUrl;

            // fetch in the background and keep the UI responsive until it is done
            getUrlRespHtml_bw(gigUrl);
            while (bWorkNotCompleted)
            {
                System.Windows.Forms.Application.DoEvents();
            }
            string gitHtml = curRespHtml;
            if (String.IsNullOrEmpty(gitHtml))
            {
                // nothing was downloaded, so there is nothing to parse
                return;
            }

            XmlDocument xmlDoc = htmlToXmlDoc(gitHtml);

            XmlNamespaceManager m = new XmlNamespaceManager(xmlDoc.NameTable);
            m.AddNamespace("w3org", "http://www.w3.org/1999/xhtml");

            //(1)title
            //http://fiverr.com/gamingaffiliate/seo-critique-your-website
            //<div class="gig-title-g">
            //    <h1 itemprop="name">
            //        I will seo critique your website with search engine optimization strategies and tips for $5
            //    </h1>
            //</div>
            XmlNode titleNode = xmlDoc.SelectSingleNode("//w3org:h1[@itemprop='name']", m);
            if (titleNode == null)
            {
                // without a title this is not a gig page in the expected layout: skip it
                return;
            }
            // InnerText is surrounded by newlines/tabs, hence the Trim
            singleGigInfo.title = titleNode.InnerText.Trim();

            //(2)seller rating ( based on 1-100% format )
            //<div class='user-rate'>rated <span class='colored green'>99%</span></div>
            XmlNode userRateNode = xmlDoc.SelectSingleNode("//w3org:div[@class='user-rate']", m);
            string userRateValue = "";
            if ((userRateNode != null)
                && crifanLib.extractSingleStr(@"(\d+)%", userRateNode.InnerText, out userRateValue))
            {
                singleGigInfo.sellerRating = Int32.Parse(userRateValue);
            }

            //(3)estimated delivery ( based on 24 hours - 7days format )
            //<li class="delv-time">
            //    <div>
            //        <span class='big-txt'>2</span> <span class='mid-txt'>days</span>
            //    </div>
            //    <div class="small-txt">EST. DELIVERY</div>
            //</li>
            //or, for express gigs:
            //        <span class='big-txt'>24</span> <span class='mid-txt'>hrs</span>
            singleGigInfo.estimatedDeliveryStr = "";
            XmlNode delvTimeNode = xmlDoc.SelectSingleNode("//w3org:li[@class='delv-time']", m);
            if (delvTimeNode != null)
            {
                XmlNode delvTimeBigTxtNode = delvTimeNode.SelectSingleNode(".//w3org:span[@class='big-txt']", m);
                XmlNode delvTimeMidTxtNode = delvTimeNode.SelectSingleNode(".//w3org:span[@class='mid-txt']", m);
                if ((delvTimeBigTxtNode != null) && (delvTimeMidTxtNode != null))
                {
                    singleGigInfo.estimatedDeliveryStr = delvTimeBigTxtNode.InnerText + " " + delvTimeMidTxtNode.InnerText;
                }
            }

            //(4)gig rating ( based on 1-100% )
            //<li class="gig-rating">
            //    <span class="big-txt">100<span class='mid-txt'>&#37;</span></span>
            //    <div class="small-txt max-rate">GIG RATING</div>
            //</li>
            //not rated yet:
            //<li class="gig-rating">
            //    <span class="big-txt not-availale">N/A</span>
            //    <div class="small-txt not-availale">NOT RATED YET</div>
            //</li>
            XmlNode gigRatingNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='gig-rating']", m);
            string gitRatingValue = "";
            if ((gigRatingNode != null)
                && crifanLib.extractSingleStr(@"(\d+)%", gigRatingNode.InnerText, out gitRatingValue))
            {
                singleGigInfo.gigRating = Int32.Parse(gitRatingValue);
            }
            else
            {
                // "N/A" (or no rating element at all) counts as 0
                singleGigInfo.gigRating = 0;
            }

            //(5)orders in que ( based on 0-9999 format )
            //<li class="queue ">
            //    <div class="big-txt">4<span class="mid-txt">in queue</span></div>
            //    <div class="small-txt">ORDERS</div>
            //</li>
            XmlNode queueNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='queue ']", m);
            if (queueNode != null)
            {
                XmlNode queueBigTxtNode = queueNode.SelectSingleNode(".//w3org:div[@class='big-txt']", m);
                string queueValue = "";
                if ((queueBigTxtNode != null)
                    && crifanLib.extractSingleStr(@"(\d+)", queueBigTxtNode.InnerText, out queueValue))
                {
                    singleGigInfo.ordersInQueue = Int32.Parse(queueValue);
                }
            }
            else
            {
                //empty queue is rendered with another class:
                //<li class="queue not-availale">
                //    <div class="big-txt">0<span class="mid-txt">in queue</span></div>
                //</li>
                XmlNode queueNoneNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='queue not-availale']", m);
                if (queueNoneNode != null)
                {
                    singleGigInfo.ordersInQueue = 0;
                }
                else
                {
                    //neither variant found
                    MessageBox.Show("Error while find orders in queue!");
                }
            }

            //(6)level of the seller ( 1-3 )
            //(11)top rated seller ( yes or no )
            //<li class="badge-container top_rated_seller"> ... </li>
            //<li class="badge-container level_two_seller"> ... </li>
            //a page without any badge-container means level 0
            XmlNode badgeLevelOneNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='badge-container level_one_seller']", m);
            XmlNode badgeLevelTwoNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='badge-container level_two_seller']", m);
            XmlNode badgeTopRatedNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='badge-container top_rated_seller']", m);

            int badgeLevel = 0;
            bool isTopRatedSeller = false;
            if (badgeLevelOneNode != null)
            {
                badgeLevel = 1;
            }
            else if (badgeLevelTwoNode != null)
            {
                badgeLevel = 2;
            }
            else if (badgeTopRatedNode != null)
            {
                badgeLevel = 3;
                isTopRatedSeller = true;
            }
            singleGigInfo.sellerLevel = badgeLevel;
            singleGigInfo.isTopRatedSeller = isTopRatedSeller;

            //(7)haz video ( yes or no )
            //<div class="play-trigger">
            //    <a href="http://api.dmcloud.net/embed/..." class="vid-play"></a>
            //</div>
            XmlNode playTriggerNode = xmlDoc.SelectSingleNode(".//w3org:div[@class='play-trigger']", m);
            singleGigInfo.hasVideo = (playTriggerNode != null);

            //(8)express gigs (yes or no )
            //<div class='express'>EXPRESS DELIVERY</div>
            XmlNode expressNode = xmlDoc.SelectSingleNode(".//w3org:div[@class='express']", m);
            singleGigInfo.isExpressGig = (expressNode != null);

            //(9)country flag ( display county flag )
            //<li class="user-det">
            //    <img src="..." class="user-photo" alt="maxsimpson" />
            //    <div>
            //        By <a href="/maxsimpson">maxsimpson</a>
            //        <div class='user-rate'>rated <span class='colored green'>98%</span></div>
            //        <span class='flag in' title="India"></span>
            //    </div>
            //</li>
            //other samples: <span class='flag us' title="United States">, <span class='flag vn' title="Viet Nam">
            XmlNode userDetNode = xmlDoc.SelectSingleNode(".//w3org:li[@class='user-det']", m);
            string countryTxt = "";
            // InnerXml serialises attributes with double quotes, so the pattern uses them too
            if ((userDetNode != null)
                && crifanLib.extractSingleStr(@"<span class=""flag \w+"" title=""([a-zA-Z ]+)""", userDetNode.InnerXml, out countryTxt))
            {
                singleGigInfo.coutryFlag = countryTxt;
            }

            //(10)+ve reviews and -ve reviews ( based on 1-9999 )
            //  <li class="thumbs">
            //    <div class="gig-stats-numbers"><span itemprop="ratingValue" content="5.0">684</span></div>
            //    <div class="gig-stats-text">POSITIVE REVIEWS</div>
            //  </li>
            //  <li class="thumbs">
            //    <div class="gig-stats-numbers"><span itemprop="reviewCount" content="690">6</span></div>
            //    <div class="gig-stats-text">NEGATIVE REVIEWS</div>
            //  </li>
            XmlNode positiveNode = xmlDoc.SelectSingleNode(".//w3org:span[@itemprop='ratingValue']", m);
            XmlNode negativeNode = xmlDoc.SelectSingleNode(".//w3org:span[@itemprop='reviewCount']", m);

            if ((positiveNode != null) && (negativeNode != null))
            {
                singleGigInfo.positiveReviews = Int32.Parse(positiveNode.InnerText);
                singleGigInfo.negativeReviews = Int32.Parse(negativeNode.InnerText);
            }
            else
            {
                //gigs without any review show neither counter
                singleGigInfo.positiveReviews = 0;
                singleGigInfo.negativeReviews = 0;
            }

            storeGigInfo(singleGigInfo);

            //update UI
            System.Windows.Forms.Application.DoEvents();
        }

        // Data scraped from one gig page; filled by processEachGig and handed to storeGigInfo.
        public struct gigInfo
        {
            // Gig title, trimmed text of the <h1 itemprop="name"> element.
            public string title;
            // Seller rating in percent, taken from the "user-rate" element.
            public int sellerRating;
            // Estimated delivery as "<number> <unit>", e.g. "2 days" or "24 hrs".
            public string estimatedDeliveryStr;
            // Gig rating in percent; 0 when the gig is not rated yet.
            public int gigRating;
            // Number of orders currently in the seller's queue.
            public int ordersInQueue;
            // Seller level from the badge: 0 = no badge, 1, 2, or 3 = top rated seller.
            public int sellerLevel;
            // True when the page contains a "play-trigger" element.
            public bool hasVideo;
            // True when the page contains an "express" element.
            public bool isExpressGig;
            // Country name from the title attribute of the seller's flag element.
            public string coutryFlag;
            // Count of positive reviews; 0 when the page shows no review counters.
            public int positiveReviews;
            // Count of negative reviews; 0 when the page shows no review counters.
            public int negativeReviews;
            // True when the seller has the "top_rated_seller" badge.
            public bool isTopRatedSeller;
            // URL of the gig page the data was scraped from.
            public string gigUrl;
        };

        /// <summary>
        /// Starts downloading <paramref name="url"/> on a new BackgroundWorker and returns
        /// immediately; bWorkNotCompleted is raised until the completion handler runs.
        /// </summary>
        /// <param name="url">Address to download; passed to the worker as its argument.</param>
        private void getUrlRespHtml_bw(string url)
        {
            // mark the request as pending before the worker is launched
            bWorkNotCompleted = true;

            BackgroundWorker downloader = new BackgroundWorker();
            downloader.DoWork += m_bgWorker_DoWork;
            downloader.RunWorkerCompleted += m_bgWorker_RunWorkerCompleted;

            // the download itself happens on another thread
            downloader.RunWorkerAsync(url);
        }

        /// <summary>
        /// Worker body: downloads the URL given as the worker argument through
        /// crifanLib.getUrlRespHtml and publishes the response as the worker result.
        /// </summary>
        private void m_bgWorker_DoWork(object sender, DoWorkEventArgs e)
        {
            string targetUrl = (string)e.Argument;
            string respHtml = crifanLib.getUrlRespHtml(targetUrl);
            e.Result = respHtml;
        }

        /// <summary>
        /// Progress handler: keeps the "download pending" flag raised.
        /// Not subscribed to any worker in the visible code.
        /// </summary>
        void m_bgWorker_ProgressChanged(object sender, ProgressChangedEventArgs e)
        {
            this.bWorkNotCompleted = true;
        }

        /// <summary>
        /// Completion handler of the download worker: stores the response in curRespHtml
        /// and clears bWorkNotCompleted so that the code waiting for the download goes on.
        /// </summary>
        /// <param name="sender">The worker that finished.</param>
        /// <param name="e">Outcome of the worker: error, cancellation or result.</param>
        /// <remarks>
        /// The flag is cleared for every outcome. If it were left set after a failed or
        /// cancelled download, the callers' wait loops would never end. In those cases
        /// curRespHtml is reset to an empty string, so the caller does not mistake the
        /// previous response for the current one.
        /// </remarks>
        private void m_bgWorker_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
        {
            string respHtml = "";

            // e.Result may only be read after a normal completion:
            // the property throws when the worker failed or was cancelled.
            if ((e.Error == null) && !e.Cancelled && (e.Result != null))
            {
                respHtml = e.Result.ToString();
            }

            curRespHtml = respHtml;

            // release the waiting code last, once the response is in place
            bWorkNotCompleted = false;
        }

        /// <summary>
        /// Search button handler: starts a new search when stopped, or continues the
        /// paused one, then walks the search result pages and scrapes every gig on them.
        /// </summary>
        /// <remarks>
        /// The loop ends when Pause/Stop clears needGetMore, or when a result page cannot
        /// be downloaded or contains no gig. In the latter cases the status goes back to
        /// stopped. The page number advances only after all gigs of a page were handled,
        /// so a search paused in the middle of a page resumes with the following page
        /// instead of skipping one.
        /// NOTE(review): downloads pump the message loop (DoEvents), so this handler can
        /// be re-entered by a click while an earlier call is still waiting -- confirm
        /// whether that needs to be prevented.
        /// </remarks>
        private void btnSearch_Click(object sender, EventArgs e)
        {
            string fiverMainUrl = "http://fiverr.com";

            if (curSearchStatus == search_status.SEARCH_STATUS_PAUSED)
            {
                //continue search
                needGetMore = true;

                curSearchStatus = search_status.SEARCH_STATUS_SEARCHING;
                updateUI();

                //first finish the gigs left over on the page that was being processed
                processPendingGigs(fiverMainUrl);
            }
            else if (curSearchStatus == search_status.SEARCH_STATUS_STOPPED)
            {
                // new search -> clear previously searched result
                clearSearchResult();

                curSearchStatus = search_status.SEARCH_STATUS_SEARCHING;
                updateUI();

                curSearchInfo = new search_info();
                curSearchInfo.pageNum = 1;

                needGetMore = true;
            }
            else
            {
                //unexpected status
                return;
            }

            while (needGetMore)
            {
                //http://fiverr.com/gigs/search?utf8=%E2%9C%93&query=seo&x=15&y=13&page=2
                curSearchInfo.searchUrl = "http://fiverr.com/gigs/search?utf8=%E2%9C%93"
                    + "&query=" + HttpUtility.UrlEncode(txbKeyword.Text)
                    + "&page=" + curSearchInfo.pageNum.ToString();

                // download in the background, keeping the UI responsive meanwhile
                getUrlRespHtml_bw(curSearchInfo.searchUrl);
                while (bWorkNotCompleted)
                {
                    System.Windows.Forms.Application.DoEvents();
                }
                curSearchInfo.searchRespHtml = curRespHtml;

                if (String.IsNullOrEmpty(curSearchInfo.searchRespHtml))
                {
                    //page could not be downloaded -> nothing more to do
                    needGetMore = false;
                    break;
                }

                curSearchInfo.xmlDoc = htmlToXmlDoc(curSearchInfo.searchRespHtml);

                curSearchInfo.m = new XmlNamespaceManager(curSearchInfo.xmlDoc.NameTable);
                curSearchInfo.m.AddNamespace("w3org", "http://www.w3.org/1999/xhtml");

                curSearchInfo.gigDataList = curSearchInfo.xmlDoc.SelectNodes("//w3org:div[@data-gig_id]", curSearchInfo.m);
                curSearchInfo.nodeIdx = 0;

                // SelectNodes returns an empty list (not null) when nothing matches,
                // so the end of the results shows up as a page without any gig.
                if ((curSearchInfo.gigDataList == null) || (curSearchInfo.gigDataList.Count == 0))
                {
                    needGetMore = false;
                    break;
                }

                processPendingGigs(fiverMainUrl);
            }

            // Still "searching" here means nobody paused or stopped the search:
            // it ran out of results, so return to the idle state.
            if (curSearchStatus == search_status.SEARCH_STATUS_SEARCHING)
            {
                curSearchStatus = search_status.SEARCH_STATUS_STOPPED;
                updateUI();
            }
        }

        // Scrapes the gigs of the current result page from curSearchInfo.nodeIdx onwards,
        // until the page is done or needGetMore is cleared; advances curSearchInfo.pageNum
        // once when a page that still had gigs pending has been completed.
        private void processPendingGigs(string fiverMainUrl)
        {
            bool hadPendingGigs = (curSearchInfo.gigDataList != null)
                && (curSearchInfo.nodeIdx < curSearchInfo.gigDataList.Count);

            while (needGetMore
                && (curSearchInfo.gigDataList != null)
                && (curSearchInfo.nodeIdx < curSearchInfo.gigDataList.Count))
            {
                XmlNode gigNode = curSearchInfo.gigDataList[curSearchInfo.nodeIdx];

                //<div class="gig-title approved">
                //    <h2><a href="/daica85/give-you-an-advance-seo-techniques-ebook">I will give you ...</a></h2>
                XmlNode gitTitleNode = gigNode.SelectSingleNode(".//w3org:div[@class='gig-title approved']", curSearchInfo.m);
                XmlNode h2ANode = null;
                if (gitTitleNode != null)
                {
                    h2ANode = gitTitleNode.SelectSingleNode(".//w3org:h2/w3org:a", curSearchInfo.m);
                }

                // entries without the expected title link are skipped
                if ((h2ANode != null) && (h2ANode.Attributes["href"] != null))
                {
                    string aHref = h2ANode.Attributes["href"].Value; // /daica85/give-you-an-advance-seo-techniques-ebook
                    processEachGig(fiverMainUrl + aHref);
                }

                curSearchInfo.nodeIdx++;
            }

            //update for next page, but only when this page has really been finished
            if (hadPendingGigs
                && (curSearchInfo.gigDataList != null)
                && (curSearchInfo.nodeIdx >= curSearchInfo.gigDataList.Count))
            {
                curSearchInfo.pageNum++;
            }
        }

        /// <summary>
        /// Click handler for the pause button. When a search is currently running,
        /// switches the state to paused, refreshes the UI and asks the search loop
        /// to stop fetching further gigs. Does nothing in any other state.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void btnPause_Click(object sender, EventArgs e)
        {
            // Only a running search can be paused.
            if (search_status.SEARCH_STATUS_SEARCHING != curSearchStatus)
            {
                return;
            }

            curSearchStatus = search_status.SEARCH_STATUS_PAUSED;
            updateUI();

            // The gig loop checks this flag before processing each gig and breaks out once it is false.
            needGetMore = false;

            //TODO: store current status and progress
        }

        /// <summary>
        /// Click handler for the stop button. A running or paused search is moved to
        /// the stopped state, the UI is refreshed and the search loop is told not to
        /// fetch any more gigs. An already stopped search is left untouched.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void btnStopSearching_Click(object sender, EventArgs e)
        {
            switch (curSearchStatus)
            {
                case search_status.SEARCH_STATUS_SEARCHING:
                case search_status.SEARCH_STATUS_PAUSED:
                    curSearchStatus = search_status.SEARCH_STATUS_STOPPED;
                    updateUI();

                    // The gig loop checks this flag before processing each gig.
                    needGetMore = false;

                    //TODO: clear things
                    break;

                default:
                    // Already stopped: nothing to do.
                    break;
            }
        }

        /// <summary>
        /// Appends one scraped gig to the result grid, fills the button cell in the
        /// gig-URL column ("Buy Now" caption, gig URL kept in the cell's Tag), selects
        /// and scrolls to the new row, and renumbers all row headers starting at 1.
        /// </summary>
        /// <param name="singleGigInfo">The gig whose fields are shown in the new row.</param>
        void storeGigInfo(gigInfo singleGigInfo)
        {
            dgvSearchResult.Rows.Add(
                singleGigInfo.title,
                singleGigInfo.sellerRating,
                singleGigInfo.estimatedDeliveryStr,
                singleGigInfo.gigRating,
                singleGigInfo.ordersInQueue,
                singleGigInfo.sellerLevel,
                singleGigInfo.hasVideo ? "yes" : "no",
                singleGigInfo.isExpressGig,
                singleGigInfo.coutryFlag,
                singleGigInfo.positiveReviews,
                singleGigInfo.negativeReviews,
                singleGigInfo.isTopRatedSeller);

            // The row just added is addressed as the last row of the grid.
            int newestRowIdx = dgvSearchResult.Rows.Count - 1;

            // The button cell shows a fixed caption; the real URL travels in Tag
            // so the click handler and the exporters can read it back.
            DataGridViewCell buyNowCell = gigUrlColumn.DataGridView.Rows[newestRowIdx].Cells[girUrlColumnIdx];
            buyNowCell.Value = "Buy Now";
            buyNowCell.Tag = singleGigInfo.gigUrl;

            // Keep the newest result selected and visible.
            dgvSearchResult.Rows[newestRowIdx].Selected = true;
            dgvSearchResult.FirstDisplayedScrollingRowIndex = newestRowIdx;

            // Draw the 1-based row number into every row header.
            int rowIdx = 0;
            while (rowIdx < dgvSearchResult.Rows.Count)
            {
                int displayedNumber = rowIdx + 1;
                dgvSearchResult.Rows[rowIdx].HeaderCell.Value = String.Format("{0}", displayedNumber);
                rowIdx++;
            }
        }

        /// <summary>
        /// Opens the gig URL stored in a "Buy Now" button cell when that cell's content
        /// is clicked. Clicks outside the gig-URL column, clicks with a negative row
        /// index, and cells without a stored URL are ignored.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Identifies the row and column of the clicked cell.</param>
        private void dgvSearchResult_CellContentClick(object sender, DataGridViewCellEventArgs e)
        {
            if ((e.RowIndex < 0) || (e.ColumnIndex != girUrlColumnIdx))
            {
                return;
            }

            DataGridViewButtonCell clickedButtonCell = (DataGridViewButtonCell)dgvSearchResult.Rows[e.RowIndex].Cells[e.ColumnIndex];

            // Tag is only filled by storeGigInfo; a cell that never received a URL
            // must not crash the handler with a NullReferenceException.
            object storedUrl = clickedButtonCell.Tag;
            string gigUrl = (storedUrl == null) ? null : storedUrl.ToString();
            if (string.IsNullOrEmpty(gigUrl))
            {
                return;
            }

            // Hand the URL to the shell.
            System.Diagnostics.Process.Start(gigUrl);
        }
        
        /// <summary>
        /// Releases a COM object reference via Marshal.ReleaseComObject and then forces
        /// a garbage collection. A null argument is skipped silently. Any exception
        /// raised while releasing is reported in a message box instead of being rethrown.
        /// </summary>
        /// <param name="obj">
        /// The COM object to release; may be null. The parameter is passed by value, so
        /// the caller's own variable is not cleared by this method.
        /// </param>
        private void releaseObject(object obj)
        {
            try
            {
                // Nothing to release for null; avoids a misleading error box.
                if (obj != null)
                {
                    System.Runtime.InteropServices.Marshal.ReleaseComObject(obj);
                }
            }
            catch (Exception ex)
            {
                MessageBox.Show("Exception Occured while releasing object " + ex.ToString());
            }
            finally
            {
                // Kept from the original: collect after releasing the COM reference.
                GC.Collect();
            }
        }

        /// <summary>
        /// Exports the result grid to "fiverrComScrapedResult.xls" through Excel
        /// automation: the column headers go to worksheet row 1 (in bold), the grid rows
        /// follow from row 2. For the gig-URL column the URL stored in the cell's Tag is
        /// written instead of the button caption. After saving, the file is revealed
        /// in Explorer.
        /// </summary>
        /// <remarks>
        /// Exactly one Excel instance is started, and it is always closed and released,
        /// even when the export fails part-way; the exception itself still propagates.
        /// </remarks>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void btnSaveAll_Click(object sender, EventArgs e)
        {
            object misValue = System.Reflection.Missing.Value;

            Excel.Application xlApp = null;
            Excel.Workbook xlWorkBook = null;
            Excel.Worksheet xlWorkSheet = null;
            Range headerRow = null;
            string fullFilename = null;
            bool savedOk = false;

            try
            {
                // Start a single Excel instance (the original started two and leaked the first).
                xlApp = new Excel.ApplicationClass();
                xlWorkBook = xlApp.Workbooks.Add(misValue);
                xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1);

                int i = 0;
                int j = 0;

                //save header (worksheet indices are the grid indices shifted by one)
                for (i = 0; i < dgvSearchResult.ColumnCount; i++)
                {
                    xlWorkSheet.Cells[0 + 1, i + 1] = dgvSearchResult.Columns[i].HeaderText;
                }

                //save cells (shifted one further row down because of the header)
                for (i = 0; i < dgvSearchResult.RowCount; i++)
                {
                    for (j = 0; j < dgvSearchResult.ColumnCount; j++)
                    {
                        DataGridViewCell cell = dgvSearchResult[j, i];
                        if (j == girUrlColumnIdx)
                        {
                            // The button cell only shows a caption; the URL lives in Tag.
                            // A cell without a stored URL is exported as empty text.
                            xlWorkSheet.Cells[i + 2, j + 1] = (cell.Tag != null) ? cell.Tag.ToString() : "";
                        }
                        else
                        {
                            xlWorkSheet.Cells[i + 2, j + 1] = cell.Value;
                        }
                    }
                }

                //formatting: header to bold
                headerRow = xlWorkSheet.get_Range("1:1", System.Type.Missing);
                headerRow.Font.Bold = true;

                string outputFilename = "fiverrComScrapedResult.xls";
                fullFilename = Path.Combine(getSaveFolder(), outputFilename);
                xlWorkBook.SaveAs(fullFilename, Excel.XlFileFormat.xlWorkbookNormal, misValue, misValue, misValue, misValue, Excel.XlSaveAsAccessMode.xlExclusive, XlSaveConflictResolution.xlLocalSessionChanges, misValue, misValue, misValue, misValue);
                savedOk = true;
            }
            finally
            {
                try
                {
                    // On failure close without saving so Excel does not try to save again.
                    if (xlWorkBook != null)
                    {
                        xlWorkBook.Close(savedOk, misValue, misValue);
                    }
                    if (xlApp != null)
                    {
                        xlApp.Quit();
                    }
                }
                finally
                {
                    // Release innermost objects first, the application last.
                    if (headerRow != null)
                    {
                        releaseObject(headerRow);
                    }
                    if (xlWorkSheet != null)
                    {
                        releaseObject(xlWorkSheet);
                    }
                    if (xlWorkBook != null)
                    {
                        releaseObject(xlWorkBook);
                    }
                    if (xlApp != null)
                    {
                        releaseObject(xlApp);
                    }
                }
            }

            // Only reached when the workbook was saved successfully.
            openFolderAndSelectFile(fullFilename);
        }

        /// <summary>
        /// Opens Windows Explorer on the folder containing the given file, with the
        /// file pre-selected.
        /// </summary>
        /// <param name="fullFilename">Full path of the file to reveal.</param>
        private void openFolderAndSelectFile(string fullFilename)
        {
            // Quote the path so that commas or spaces inside it are not taken as
            // argument separators by Explorer's command-line parsing.
            string explorerArgs = "/select,\"" + fullFilename + "\"";
            System.Diagnostics.Process.Start("Explorer.exe", explorerArgs);
        }

        /// <summary>
        /// Asks the user for an output folder through the folder browser dialog.
        /// </summary>
        /// <returns>
        /// The folder selected in the dialog when it was confirmed with OK; otherwise
        /// the process's current directory as read before the dialog was shown.
        /// </returns>
        private string getSaveFolder()
        {
            // Read the fallback first, before the dialog is displayed.
            string fallbackFolder = System.Environment.CurrentDirectory;

            System.Windows.Forms.DialogResult pickResult = fbdSaveFolder.ShowDialog();
            if (pickResult != System.Windows.Forms.DialogResult.OK)
            {
                return fallbackFolder;
            }

            return fbdSaveFolder.SelectedPath;
        }

        /// <summary>
        /// Exports the result grid to "fiverrComScrapedResult.csv" (UTF-8) in the folder
        /// chosen by the user: one header line followed by one line per grid row. For the
        /// gig-URL column the URL stored in the cell's Tag is written instead of the
        /// button caption. After saving, the file is revealed in Explorer.
        /// </summary>
        /// <remarks>
        /// Fields containing the delimiter, a double quote or a line break are quoted so
        /// they cannot break the column layout. Each line still ends with a trailing
        /// delimiter, so output for plain values is unchanged from earlier versions.
        /// </remarks>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void btnExportToCsv_Click(object sender, EventArgs e)
        {
            //settings
            string delimiter = ",";

            string outputFilename = "fiverrComScrapedResult.csv";
            string fullFilename = Path.Combine(getSaveFolder(), outputFilename);

            // 'using' guarantees the file is flushed and closed even if a row fails.
            using (StreamWriter csvStreamWriter = new StreamWriter(fullFilename, false, System.Text.Encoding.UTF8))
            {
                StringBuilder lineBuilder = new StringBuilder();

                //output header data
                for (int i = 0; i < dgvSearchResult.Columns.Count; i++)
                {
                    lineBuilder.Append(escapeCsvField(dgvSearchResult.Columns[i].HeaderText, delimiter));
                    lineBuilder.Append(delimiter);
                }
                csvStreamWriter.WriteLine(lineBuilder.ToString());

                //output rows data
                for (int j = 0; j < dgvSearchResult.Rows.Count; j++)
                {
                    lineBuilder.Length = 0;

                    for (int k = 0; k < dgvSearchResult.Columns.Count; k++)
                    {
                        DataGridViewCell cell = dgvSearchResult.Rows[j].Cells[k];

                        // The button cell only shows a caption; the URL lives in Tag.
                        object rawValue = (k == girUrlColumnIdx) ? cell.Tag : cell.Value;

                        // A missing value or URL is exported as an empty field.
                        string fieldText = (rawValue == null) ? "" : rawValue.ToString();

                        lineBuilder.Append(escapeCsvField(fieldText, delimiter));
                        lineBuilder.Append(delimiter);
                    }
                    csvStreamWriter.WriteLine(lineBuilder.ToString());
                }
            }

            //after save file
            openFolderAndSelectFile(fullFilename);
        }

        /// <summary>
        /// Returns <paramref name="field"/> in a form that is safe to place between
        /// delimiters: a value containing the delimiter, a double quote, CR or LF is
        /// wrapped in double quotes with embedded quotes doubled; any other value is
        /// returned unchanged. Null yields an empty string.
        /// </summary>
        private static string escapeCsvField(string field, string delimiter)
        {
            if (string.IsNullOrEmpty(field))
            {
                return "";
            }

            bool needsQuoting =
                (field.IndexOf(delimiter, StringComparison.Ordinal) >= 0) ||
                (field.IndexOfAny(new char[] { '"', '\r', '\n' }) >= 0);
            if (!needsQuoting)
            {
                return field;
            }

            return "\"" + field.Replace("\"", "\"\"") + "\"";
        }

        /// <summary>
        /// Removes every row from the search result grid.
        /// </summary>
        private void clearSearchResult()
        {
            DataGridViewRowCollection resultRows = dgvSearchResult.Rows;
            resultRows.Clear();
        }

        /// <summary>
        /// Click handler for the "clear all" button: empties the search result grid.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void btnClearAll_Click(object sender, EventArgs e)
        {
            this.clearSearchResult();
        }

        /// <summary>
        /// Click handler for the help button: hands the help page URL to
        /// Process.Start so the shell opens it.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event arguments (unused).</param>
        private void btnHelp_Click(object sender, EventArgs e)
        {
            System.Diagnostics.Process.Start("http://giggladiator.com/help");
        }

    }

}

(2)

转载请注明:在路上 » 【代码分享】C#代码:FiverComScraper – 只抓取fiverr.com,网站改版之前

发表我的评论
取消评论

表情

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址
88 queries in 0.188 seconds, using 20.69MB memory