最新消息:20210917 已从crifan.com换到crifan.org

【代码分享】C#代码:ScrapeAmazonProduct – 抓取Amazon产品数据(完全从网页中抓取)

CodeShare crifan 3191浏览 0评论

【背景】

之前写了个C#程序,从Amazon中抓取数据。

此版本是完全从网页中抓取产品信息的。

 

【ScrapeAmazonProduct代码分享】

1.截图:

ScrapeAmazonProduct main ui scrape from html

2.完整项目代码下载:

ScrapeAmazonProduct_2013-06-11_scrapeFromHtml.zip

 

3.代码分享:

(1)frmScrapeAmazonProduct.cs

/*
 * [File]
 * frmScrapeAmazonProduct.cs
 * 
 * [Function]
 * Scrape products data from Amazon
 * 
 * [Author]
 * Crifan Li
 * 
 * [Date]
 * 2013-06-11
 * 
 * [Contact]
 * https://www.crifan.org/contact_me/
 */

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;

using System.Web;
using System.Net;

using System.Xml;
using System.IO;

using HtmlAgilityPack;

using System.Text.RegularExpressions;

using Excel = Microsoft.Office.Interop.Excel;
using Microsoft.Office.Interop.Excel;

using NLog;
using NLog.Targets;
using NLog.Config;

namespace ScrapeAmazonProduct
{
    public partial class frmScrapeAmazonProduct : Form
    {
        /// <summary>
        /// Data collected for a single Amazon product page.
        /// </summary>
        struct AmazonProductInfo
        {
            public string url; // product page URL; identifies which product this record belongs to

            public string title;
            public string description;
            // bullet points: the page may list more, but only the first 5 are recorded
            public string[] bulletArr; // total 5 (or more, but only record 5)
            // filenames of the downloaded product pictures
            public string[] imgFullnameArr; // total 5 (or more, but only record 5)
            // product keyword fields, up to 3
            public string[] keywordFieldArr; // each field is less than 50 chars, separated by ','
            // highest price among all (up to 8) sellers
            public float highestPrice;
            public bool isOneSellerIsAmazon;
            public int reviewNumber;
            public bool isBestSeller;
        };

        // for debug: line counter referenced only by the commented-out log() helper in this class
        private int lineNumber = 1;

        // Output locations. The "full"/"abs" variants and gLogFilename are filled in by
        // frmScrapeAmazonProduct_Load from the current directory and these base names.
        string outputExcelFilename = "AmazonProductInfo.xls";
        string constOutputFolderName = "output";
        string outputExcelFullFilename = "";
        string absOutputFolder = "";
        string gLogFilename;

        // Prefix prepended to relative Amazon URLs found in pages
        public static string constAmazonDomainUrl = "http://www.amazon.com";

        // Product filter thresholds used by the check* methods:
        // checkBuyerNumber requires more than rule_minimalBuyerNumber,
        // checkTotalUnitNumber requires more than rule_totalUnitNumber.
        public static int rule_minimalBuyerNumber = 8;
        public static int rule_totalUnitNumber = 50;
        // max length for each bullet; longer bullets are truncated to this length (or should it be 90?)
        public static int rule_maxLenEachBullet = 100;
        // per-axis upper bounds (centimeters) applied by checkDimension
        public static float rule_dimensionMaxLengthCm = 80.0F;
        public static float rule_dimensionMaxWidthCm = 80.0F;
        public static float rule_dimensionMaxHeightCm = 80.0F;

        // max length of a single keyword field
        public static int rule_maxSingleKeywordFieldLen = 50;

        // main (search) category key -> best seller category key;
        // built by initMainCategoryToBestSellerCategoryMapping
        Dictionary<string, string> gMainCatMappingBestSellerCatDict;

        // general helper library (HTTP requests, HTML parsing, string extraction, downloads)
        public crifanLib crl;

        // Amazon-specific extraction helpers and the category lists they return
        public crifanLibAmazon amazonLib;
        List<crifanLibAmazon.categoryItem> mainCategoryList;
        List<crifanLibAmazon.categoryItem> bestSellerCategoryList;

        // for log: NLog logger, created in initLogger (null until then)
        public Logger gLogger = null;

        /// <summary>
        /// Creates the form: registers the embedded-DLL resolver, instantiates the helper
        /// libraries and finally runs the designer-generated control setup.
        /// </summary>
        public frmScrapeAmazonProduct()
        {
            // Embedded DLL loading, step (1): register the resolve handler first,
            // before any other statement of the constructor runs.
            AppDomain.CurrentDomain.AssemblyResolve += CurrentDomain_AssemblyResolve;

            this.crl = new crifanLib();
            this.amazonLib = new crifanLibAmazon();

            // the category mapping is built later, in the form's Load handler
            this.gMainCatMappingBestSellerCatDict = null;

            InitializeComponent();
        }

        //!!! for load embedded dll: (2) implement this handler
        /// <summary>
        /// AssemblyResolve handler that loads a requested assembly from the byte[] resources
        /// embedded in this program's Properties.Resources.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="args">Carries the name of the assembly that failed to resolve.</param>
        /// <returns>
        /// The loaded assembly, or null when the name is a satellite "resources" assembly or
        /// no matching byte[] resource is embedded, so resolution simply reports "not found".
        /// </returns>
        System.Reflection.Assembly CurrentDomain_AssemblyResolve(object sender, ResolveEventArgs args)
        {
            // args.Name is either a display name ("Foo.Bar, Version=...") or a plain name; keep the simple name only
            int commaPos = args.Name.IndexOf(',');
            string dllName = (commaPos >= 0) ? args.Name.Substring(0, commaPos) : args.Name.Replace(".dll", "");

            // the embedded resources are looked up with '.' replaced by '_' (Foo.Bar -> Foo_Bar)
            dllName = dllName.Replace(".", "_");

            if (dllName.EndsWith("_resources")) return null;

            System.Resources.ResourceManager rm = new System.Resources.ResourceManager(GetType().Namespace + ".Properties.Resources", System.Reflection.Assembly.GetExecutingAssembly());

            // GetObject returns null for an unknown resource name; previously that null (or a
            // non-byte[] resource) was passed on and made the handler itself throw.
            byte[] bytes = rm.GetObject(dllName) as byte[];
            if (bytes == null)
            {
                return null;
            }

            return System.Reflection.Assembly.Load(bytes);
        }
        
        /// <summary>
        /// Loads the main (search) category list and the best seller category list through
        /// amazonLib, and binds the main list to the search category combo box.
        /// </summary>
        private void initSearchCategory()
        {
            const string mainCategoryPageUrl = "http://www.amazon.com/ref=nb_sb_noss_null";
            // an alternative best seller URL tried earlier: http://www.amazon.com/Best-Sellers/zgbs/ref=zg_bs_tab
            const string bestSellerPageUrl = "http://www.amazon.com/Best-Sellers/zgbs";

            mainCategoryList = amazonLib.extractMainCategoryList(mainCategoryPageUrl);

            bool hasMainCategories = (mainCategoryList != null) && (mainCategoryList.Count > 0);
            if (!hasMainCategories)
            {
                gLogger.Fatal("can not find main category list");
            }
            else
            {
                // populate the search category combo box
                cmbSearchCategory.DataSource = mainCategoryList;
                cmbSearchCategory.DisplayMember = "name";
            }

            bestSellerCategoryList = amazonLib.extractBestSellerCategoryList(bestSellerPageUrl);

            // Debug aid used during development (disabled): trace every item of mainCategoryList
            // and bestSellerCategoryList as its 1-based index, Name, Key and Url, e.g.
            //    gLogger.Trace("Name:\t" + catItem.Name);
        }


        /// <summary>
        /// Builds gMainCatMappingBestSellerCatDict, which maps a main (search) category key
        /// to the key of the corresponding best seller category.
        /// </summary>
        private void initMainCategoryToBestSellerCategoryMapping()
        {
            // { main category key, best seller category key }
            // Main keys without a best seller counterpart (not mapped): instant-video, collectibles, financial.
            // Best seller keys without a main counterpart (not mapped): photo, kitchen.
            string[,] keyPairs = new string[,]
            {
                { "appliances",      "appliances" },
                { "mobile-apps",     "mobile" },
                { "arts-crafts",     "arts" },
                { "automotive",      "automotive" },
                { "baby-products",   "baby" },
                { "beauty",          "beauty" },
                { "stripbooks",      "books" },
                { "mobile",          "wireless" },
                { "apparel",         "apparel" },
                { "computers",       "pc" },
                { "electronics",     "electronics" },
                { "gift-cards",      "gift" },
                { "grocery",         "grocery" },
                { "hpc",             "hpc" },
                { "garden",          "home" },
                { "industrial",      "industrial" },
                { "jewelry",         "jewelry" },
                { "digital-text",    "digital" },
                { "magazines",       "magazines" },
                { "movies-tv",       "movies" },
                { "digital-music",   "dmusic" },      // MP3 Music
                { "popular",         "music" },       // Music
                { "mi",              "musical" },     // Musical Instruments
                { "office-products", "office" },
                { "lawngarden",      "lawn" },
                { "pets",            "pet" },
                { "shoes",           "shoes" },
                { "software",        "software" },
                { "sporting",        "sporting" },
                { "tools",           "hi" },          // Tools & Home Improvement -> Home Improvement
                { "toys-and-games",  "toys" },
                { "videogames",      "videogames" },
                { "watches",         "watches" },
            };

            gMainCatMappingBestSellerCatDict = new Dictionary<string, string>();
            int pairCount = keyPairs.GetLength(0);
            for (int row = 0; row < pairCount; row++)
            {
                gMainCatMappingBestSellerCatDict.Add(keyPairs[row, 0], keyPairs[row, 1]);
            }
        }

        /// <summary>
        /// Configures NLog in code: Info-and-above messages go to the rtbLog rich text box,
        /// Trace-and-above messages go to the file named by gLogFilename. Stores the
        /// resulting logger in gLogger.
        /// </summary>
        private void initLogger()
        {
            // One layout for both targets: [date time][level, upper case, padded to 5] message
            // Layout renderer reference:
            //   https://github.com/nlog/nlog/wiki/Layout-renderers
            //   https://github.com/nlog/nlog/wiki/Level-Layout-Renderer
            string sharedLayout = "[${date:format=yyyy-MM-dd HH\\:mm\\:ss}][${pad:padding=5:inner=${level:uppercase=true}}] ${message}";

            // target 1: the RichTextBox on this form
            RichTextBoxTarget logBoxTarget = new RichTextBoxTarget();
            logBoxTarget.FormName = "frmScrapeAmazonProduct"; // winform class name
            logBoxTarget.ControlName = "rtbLog";               // RichTextBox control/variable name
            logBoxTarget.Layout = sharedLayout;

            // target 2: the log file, e.g. <output folder>\2013-06-11_153647_log.txt
            FileTarget logFileTarget = new FileTarget();
            logFileTarget.FileName = gLogFilename;
            logFileTarget.Layout = sharedLayout;

            // register both targets, then route every logger name ("*") to them
            LoggingConfiguration nlogConfig = new LoggingConfiguration();
            nlogConfig.AddTarget("richTextBox", logBoxTarget);
            nlogConfig.AddTarget("logFile", logFileTarget);
            nlogConfig.LoggingRules.Add(new LoggingRule("*", LogLevel.Info, logBoxTarget));
            nlogConfig.LoggingRules.Add(new LoggingRule("*", LogLevel.Trace, logFileTarget));

            // activate the configuration and fetch the logger used by this form
            LogManager.Configuration = nlogConfig;
            gLogger = LogManager.GetLogger("");
        }
        
        /// <summary>
        /// Form Load handler: prepares the output folder and file names, then initialises
        /// logging, the category mapping and the category lists, in that order.
        /// </summary>
        private void frmScrapeAmazonProduct_Load(object sender, EventArgs e)
        {
            // 1. output directory (created on first run) and the Excel file inside it
            absOutputFolder = Path.Combine(Environment.CurrentDirectory, constOutputFolderName);
            bool outputFolderMissing = !Directory.Exists(absOutputFolder);
            if (outputFolderMissing)
            {
                Directory.CreateDirectory(absOutputFolder);
            }
            outputExcelFullFilename = Path.Combine(absOutputFolder, outputExcelFilename);

            // 2. time-stamped log file name, e.g. "2013-06-11_153647_log.txt", inside the output directory
            string timeStamp = DateTime.Now.ToString("yyyy-MM-dd_HHmmss");
            gLogFilename = Path.Combine(absOutputFolder, timeStamp + "_log.txt");

            // 3. logger (needs gLogFilename)
            initLogger();

            // 4. main category -> best seller category mapping
            initMainCategoryToBestSellerCategoryMapping();

            // 5. category lists
            initSearchCategory();
        }

        //private void print(string info)
        //{
        //    rtbLog.Text = rtbLog.Text + info + Environment.NewLine;

        //    System.Windows.Forms.Application.DoEvents();
        //}

        //private void log(string info)
        //{
        //    rtbLog.Text = "[" + lineNumber.ToString() + "]"
        //        + info
        //        + Environment.NewLine
        //        + rtbLog.Text;
        //    lineNumber++;

        //    System.Windows.Forms.Application.DoEvents();
        //}

        /// <summary>
        /// Checks that the buyer number extracted from the product page is greater than
        /// rule_minimalBuyerNumber.
        /// </summary>
        /// <param name="productHtml">HTML of the product page.</param>
        /// <param name="invalidReason">Empty on success, otherwise why the check failed.</param>
        /// <param name="usedAndNewUrl">The "used and new" URL as returned by amazonLib.</param>
        /// <returns>true when the buyer number was found and exceeds the minimum.</returns>
        private bool checkBuyerNumber(string productHtml, out string invalidReason, out string usedAndNewUrl)
        {
            invalidReason = "Unknow error for checkBuyerNumber";
            usedAndNewUrl = "";

            int buyerCount = 0;
            bool extracted = amazonLib.extractProductBuyerNumberAndNewUrl(productHtml, out buyerCount, out usedAndNewUrl);
            if (!extracted)
            {
                invalidReason = "Not found buyer number string and used and new url";
                return false;
            }

            if (buyerCount <= rule_minimalBuyerNumber)
            {
                invalidReason = String.Format("Buyer Number is {0}, less than {1}", buyerCount, rule_minimalBuyerNumber);
                return false;
            }

            invalidReason = "";
            return true;
        }
        
        /// <summary>
        /// Checks whether the units left in stock, summed over the offers on an offer-listing
        /// page, exceed rule_totalUnitNumber.
        /// </summary>
        /// <remarks>
        /// Example page: http://www.amazon.com/gp/offer-listing/B0083PWAPW/ref=dp_olp_all_mbc?ie=UTF8&amp;condition=all
        /// Every offer on that page is an "add to cart" form:
        ///   form method="POST" action="/gp/item-dispatch/ref=olp_atc_used_1" with hidden inputs
        ///   (session-id, qid, sr, signInToHUC, metric-asin.*, registryItemID.1, registryID.1,
        ///   itemCount, offeringID.1, isAddon) and an image submit button named submit.addToCart.
        /// For each form the hidden inputs are POSTed without following the redirect; the page
        /// named by the Location header is then fetched and searched for
        ///   div class="hlb-scarcity red" containing "Only 8 left in stock."
        /// The numbers found are added up; the loop stops as soon as the threshold is passed.
        /// </remarks>
        /// <param name="productUrl">URL of the offer-listing ("used and new") page.</param>
        /// <param name="invalidReason">"No error" on success, otherwise why the check failed.</param>
        /// <returns>true when the summed stock is greater than rule_totalUnitNumber.</returns>
        private bool checkTotalUnitNumber(string productUrl, out string invalidReason)
        {
            bool isTotal50UnitNum = false;
            invalidReason = "Unknow error for checkTotalUnitNumber";

            int totalNumber = 0;
            // true once a specific failure reason has been stored in invalidReason
            bool hitError = false;

            string respHtml = crl.getUrlRespHtml_multiTry(productUrl);
            HtmlAgilityPack.HtmlDocument htmlDoc = crl.htmlToHtmlDoc(respHtml);
            HtmlNodeCollection postItemNodeList = htmlDoc.DocumentNode.SelectNodes("//form[starts-with(@action, '/gp/item-dispatch/ref=') and @method='POST']");
            if (postItemNodeList == null)
            {
                invalidReason = "Can not found /gp/item-dispatch post item";
                return false;
            }

            foreach (HtmlNode postItemNode in postItemNodeList)
            {
                // action is relative, e.g. /gp/item-dispatch/ref=olp_atc_used_1
                string itemDispatchUrl = constAmazonDomainUrl + postItemNode.Attributes["action"].Value;

                HtmlNodeCollection inputTypeNodeList = postItemNode.SelectNodes(".//input[@type='hidden' and @name and @value]");
                if (inputTypeNodeList == null)
                {
                    // fixed: the placeholder was {1} with a single argument, which threw FormatException
                    invalidReason = String.Format("Can not find input tag for node: {0}", postItemNode.InnerHtml);
                    hitError = true;
                    break;
                }

                // Collect the post fields. The indexer is used instead of Add so that a repeated
                // input name overwrites the earlier value rather than throwing ArgumentException.
                Dictionary<string, string> postDict = new Dictionary<string, string>();
                foreach (HtmlNode inputTypeNode in inputTypeNodeList)
                {
                    postDict[inputTypeNode.Attributes["name"].Value] = inputTypeNode.Attributes["value"].Value;
                }
                // click coordinates of the image submit button
                postDict["submit.addToCart.x"] = "63";
                postDict["submit.addToCart.y"] = "7";

                Dictionary<string, string> headerDict = new Dictionary<string, string>();
                headerDict.Add("AllowAutoRedirect", "false");
                headerDict.Add("Referer", productUrl);

                // do POST, no auto redirect: only the Location header of the response is needed
                string viewHtmlUrl = null;
                HttpWebResponse resp = crl.getUrlResponse(itemDispatchUrl, headerDict, postDict);
                if (resp != null)
                {
                    try
                    {
                        viewHtmlUrl = resp.Headers["Location"];
                    }
                    finally
                    {
                        // release the connection; it was never closed before
                        resp.Close();
                    }
                }

                if (String.IsNullOrEmpty(viewHtmlUrl))
                {
                    invalidReason = "Not found viewHtmlUrl";
                    hitError = true;
                    break;
                }

                respHtml = crl.getUrlRespHtml_multiTry(viewHtmlUrl);
                htmlDoc = crl.htmlToHtmlDoc(respHtml);
                HtmlNode hlbScarcityNode = htmlDoc.DocumentNode.SelectSingleNode("//div[starts-with(@class, 'hlb-scarcity')]");
                if (hlbScarcityNode == null)
                {
                    // first one is amazon, no hlb-scarcity red; others must have this.
                    // Such an offer adds nothing to the total.
                    continue;
                }

                string leftInStockStr = hlbScarcityNode.InnerText; //Only 1 left in stock.
                string leftNumberStr = "";
                if (!crl.extractSingleStr(@"Only (\d+) left in stock", leftInStockStr, out leftNumberStr))
                {
                    invalidReason = "Can not find remaining number";
                    hitError = true;
                    break;
                }

                totalNumber += Int32.Parse(leftNumberStr);
                if (totalNumber > rule_totalUnitNumber)
                {
                    isTotal50UnitNum = true;
                    invalidReason = "No error";
                    break;
                }
            }

            if (!isTotal50UnitNum && !hitError)
            {
                // all offers were processed but the sum stayed at or below the threshold:
                // report that instead of the generic "Unknow error" placeholder
                invalidReason = String.Format("Total unit number is {0}, not more than {1}", totalNumber, rule_totalUnitNumber);
            }

            return isTotal50UnitNum;
        }
                
        /// <summary>
        /// Checks that the product weight extracted from the page is known and does not
        /// exceed 2.5 kg (the limit the messages describe as "5 pounds").
        /// </summary>
        /// <param name="productUrl">Product page URL (not used by this check).</param>
        /// <param name="productHtml">HTML of the product page.</param>
        /// <param name="invalidReason">Empty on success, otherwise why the check failed.</param>
        /// <returns>true when a positive weight was found and it is within the limit.</returns>
        private bool checkWeight(string productUrl, string productHtml, out string invalidReason)
        {
            const float maxKiloGram = 2.5F;

            float kiloGram = amazonLib.extractProductWeight(productHtml);

            // a weight that is not greater than zero means "not found / not recognised"
            if (!(kiloGram > 0.0F))
            {
                invalidReason = "Not found weight string or unrecognized weight number";
                return false;
            }

            if (kiloGram > maxKiloGram)
            {
                invalidReason = String.Format("Weight is {0} kilogram, more than 5 pounds", kiloGram);
                return false;
            }

            // success: clear the reason (as checkBuyerNumber does) instead of leaving the
            // "Unknow error" placeholder attached to a valid product
            invalidReason = "";
            return true;
        }

        /// <summary>
        /// Checks that the product dimension extracted from the page stays within the
        /// rule_dimensionMax*Cm limits. A product without dimension information passes.
        /// </summary>
        /// <param name="productUrl">Product page URL (not used by this check).</param>
        /// <param name="productHtml">HTML of the product page.</param>
        /// <param name="invalidReason">Empty on success, otherwise why the check failed.</param>
        /// <returns>true when the dimension is within the limits or was not found.</returns>
        private bool checkDimension(string productUrl, string productHtml, out string invalidReason)
        {
            crifanLibAmazon.productDimension dimensionCm = amazonLib.extractProductDimension(productHtml);

            // a length that is not greater than zero is treated as "no dimension found"
            if (!(dimensionCm.length > 0.0F))
            {
                // even if no dimension, also consider it as valid one if the weight is valid
                invalidReason = "";
                return true;
            }

            bool withinLimits =
                (dimensionCm.length <= rule_dimensionMaxLengthCm) &&
                (dimensionCm.width <= rule_dimensionMaxWidthCm) &&
                (dimensionCm.height <= rule_dimensionMaxHeightCm);
            if (!withinLimits)
            {
                invalidReason = String.Format("Dimension: {0}cm x {1}cm x {2}cm invalid for exceed max: {3}cm x {4}cm x {5}cm",
                    dimensionCm.length, dimensionCm.width, dimensionCm.height,
                    rule_dimensionMaxLengthCm, rule_dimensionMaxWidthCm, rule_dimensionMaxHeightCm);
                return false;
            }

            // success: clear the reason (as checkBuyerNumber does) instead of leaving the
            // "Unknow error" placeholder attached to a valid product
            invalidReason = "";
            return true;
        }

        /// <summary>
        /// Runs the product filters in order and stops at the first one that fails:
        /// buyer number, total unit number, weight, dimension.
        /// </summary>
        /// <param name="productUrl">Product page URL.</param>
        /// <param name="productHtml">HTML of the product page.</param>
        /// <param name="invalidReason">Reason text produced by the last check that ran.</param>
        /// <param name="usedAndNewUrl">The "used and new" URL produced by the buyer number check.</param>
        /// <returns>true only when every check passed.</returns>
        private bool checkProductValid(string productUrl, string productHtml, out string invalidReason, out string usedAndNewUrl)
        {
            invalidReason = "";
            usedAndNewUrl = "";

            //1. buyer number > 8
            if (!checkBuyerNumber(productHtml, out invalidReason, out usedAndNewUrl))
            {
                return false;
            }

            //2. total unit number > 50
            if (!checkTotalUnitNumber(usedAndNewUrl, out invalidReason))
            {
                return false;
            }

            //3. no more than 5 pounds (2.5 kg)
            if (!checkWeight(productUrl, productHtml, out invalidReason))
            {
                return false;
            }

            //4. dimension less than 80cmX80cmX80cm
            return checkDimension(productUrl, productHtml, out invalidReason);
        }

        /// <summary>
        /// Progress callback handed to crl.downloadFile by downloadPictures.
        /// Currently does nothing: the progress bar update is commented out.
        /// </summary>
        /// <param name="percentage">Progress value supplied by the caller (presumably 0-100; not verified here).</param>
        public void updateProgress(int percentage)
        {
            //pgbDownload.Value = percentage;
        }

        /// <summary>
        /// Downloads every product image found in the page HTML into
        /// &lt;output folder&gt;\download\&lt;ASIN&gt;\ and reports the local file names.
        /// </summary>
        /// <param name="productUrl">Product page URL; the ASIN used as folder name is extracted from it.</param>
        /// <param name="respHtml">HTML of the product page.</param>
        /// <param name="picFullnameList">
        /// Full local file name per image URL, in the same order; null when no image URL was found.
        /// </param>
        public void downloadPictures(string productUrl, string respHtml, out string[] picFullnameList)
        {
            picFullnameList = null;

            string productAsin = "";
            if (!amazonLib.extractAsinFromProductUrl(productUrl, out productAsin) || (productAsin == null))
            {
                // Previously ignored silently. Keep the old fallback (save directly under the
                // download folder), but say so, and make sure Path.Combine never sees null.
                gLogger.Warn("Can not extract ASIN from " + productUrl + ", pictures are saved directly in the download folder");
                productAsin = "";
            }

            //create folder
            string downloadRootPath = Path.Combine(absOutputFolder, "download");
            string downloadFullPath = Path.Combine(downloadRootPath, productAsin);
            if (!Directory.Exists(downloadFullPath))
            {
                Directory.CreateDirectory(downloadFullPath);
            }

            string[] imageUrlList = amazonLib.extractProductImageList(respHtml);
            gLogger.Info("Extracted image url list:");
            if (imageUrlList == null)
            {
                gLogger.Error("No image url for " + productUrl);
                return;
            }

            picFullnameList = new string[imageUrlList.Length];
            for (int idx = 0; idx < imageUrlList.Length; idx++)
            {
                string imageUrl = imageUrlList[idx];
                gLogger.Info(String.Format("[{0}]={1}", idx, imageUrl));

                string picFilename = crl.extractFilenameFromUrl(imageUrl);
                string picFullFilename = Path.Combine(downloadFullPath, picFilename);

                string errorStr = "";
                // fixed: the message had a stray ']' after the url
                gLogger.Info(String.Format("Downloading {0} to {1}", imageUrl, picFullFilename));
                crl.downloadFile(imageUrl, picFullFilename, out errorStr, updateProgress);

                // NOTE(review): assumes downloadFile leaves errorStr empty on success - confirm in crifanLib
                if (!String.IsNullOrEmpty(errorStr))
                {
                    gLogger.Error(String.Format("Download {0} failed: {1}", imageUrl, errorStr));
                }

                // the target file name is recorded regardless, as before
                picFullnameList[idx] = picFullFilename;
            }
        }

        /*
         * productUrl=http://www.amazon.com/Kindle-Paperwhite-Touch-light/dp/B007OZNZG0/ref=lp_1055398_1_1?ie=UTF8&qid=1370510177&sr=1-1
         * usedAndNewUrl=http://www.amazon.com/gp/offer-listing/B007OZNZG0/ref=dp_olp_all_mbc?ie=UTF8&condition=all
         */
        private AmazonProductInfo extractProductInfo(string productUrl, string productHtml, string usedAndNewUrl)
        {
            gLogger.Info("Extracting info for " + productUrl);

            //init
            AmazonProductInfo productInfo = new AmazonProductInfo();
            productInfo.url = productUrl;
            productInfo.highestPrice = 0.0F;
            productInfo.isOneSellerIsAmazon = false;

            //must init, otherwise, when only got 4 bullet, here total 5 -> last is null -> assign later will exception
            productInfo.bulletArr = new string[5];
            crl.emptyStringArray(productInfo.bulletArr);
            productInfo.imgFullnameArr = new string[5];
            crl.emptyStringArray(productInfo.imgFullnameArr);
            productInfo.keywordFieldArr = new string[3];
            crl.emptyStringArray(productInfo.keywordFieldArr);

            //1. title
            productInfo.title = amazonLib.extractProductTitle(productHtml);
            gLogger.Info("Title=" + productInfo.title);

            //2. description and 5 bullet
            List<string> bulletList = new List<string>();
            bool gotBullets = amazonLib.extractProductBulletList(productHtml, out bulletList);
            gLogger.Info("Extracted Bullets=" + gotBullets);

            string description = "";
            bool gotDescription = amazonLib.extractProductDescription(productHtml, out description);
            gLogger.Info("Got Description=" + gotDescription);

            /*
              * 1. if no description, use bullet
              * 2. if more than normal 5 bullets, get all bullets, just use first 5 bullets to description
              * 3. if no bullet, use description to split to 5 bullets
              */
            
            //type1: has description, has bullet
            if ((description != "") && (bulletList.Count > 0))
            {
                productInfo.description = description;

                //bullets
                //maybe has more than 5 bullets
                //maybe less than 5 bullets
                //http://www.amazon.com/AmazonBasics-Lightning-Compatible-Cable-inch/dp/B00B5RGAWY/ref=sr_1_3?s=wireless&ie=UTF8&qid=1369753764&sr=1-3
                //has feature-bullets_feature_div, but no content -> bulletsNodeList is null

                for (int idx = 0; idx < bulletList.Count; idx++)
                {
                    string bulletStr = bulletList[idx];

                    //get first 5 -> to bullet
                    if (idx < 5)
                    {
                        productInfo.bulletArr[idx] = bulletStr;
                    }
                }
            }
            //type2: no description, has bullet
            else if ((description == "") && (bulletList.Count > 0))
            {
                //bullets
                //maybe has more than 5 bullets
                //maybe less than 5 bullets
                for (int idx = 0; idx < bulletList.Count; idx++)
                {
                    string bulletStr = bulletList[idx];

                    //get first 5 -> to bullet
                    if (idx < 5)
                    {
                        productInfo.bulletArr[idx] = bulletStr;
                    }

                    //all bullet -> description
                    description = description + bulletStr + Environment.NewLine;
                }

                productInfo.description = description;
            }
            //type3: has description, no bullet
            else if ((description != "") && (bulletList.Count == 0))
            {
                productInfo.description = description;

                //seperate description to many lines
                string[] lines = description.Split('.');

                //maybe less than 5, maybe greater than 5
                for (int idx = 0; idx < lines.Length; idx++)
                {
                    string curLine = lines[idx];

                    //get first 5 -> to bullet
                    if (idx < 5)
                    {
                        productInfo.bulletArr[idx] = curLine;
                    }
                }
            }
            //type4: no description, no bullet
            else if ((description == "") && (bulletList.Count == 0))
            {
                //something wrong
            }

            //check max length for each bullet
            for (int idx = 0; idx < productInfo.bulletArr.Length; idx++)
            {
                if (productInfo.bulletArr[idx].Length > rule_maxLenEachBullet)
                {
                    productInfo.bulletArr[idx] = productInfo.bulletArr[idx].Substring(0, rule_maxLenEachBullet);
                }
            }
            
            //check max length for whole description ?


            //3. download 5(or 7) pics
            string[] picFullnameList = null;
            //debug
            downloadPictures(productUrl, productHtml, out picFullnameList);
            if((picFullnameList != null) && (picFullnameList.Length > 0))
            {
                int maxImageCount = 0;
                if(picFullnameList.Length > productInfo.imgFullnameArr.Length)
                {
                    maxImageCount = productInfo.imgFullnameArr.Length;
                }
                else
                {
                    maxImageCount = picFullnameList.Length;
                }
                for (int idx = 0; idx < maxImageCount; idx++)
                {
                    productInfo.imgFullnameArr[idx] = picFullnameList[idx];
                }
            }

            //4.extract product seller info: price and name
            List<crifanLibAmazon.productSellerInfo> allSellerInfoList = new List<crifanLibAmazon.productSellerInfo>();
            if (amazonLib.extractAllSellerInfo(usedAndNewUrl, out allSellerInfoList))
            {
                foreach (crifanLibAmazon.productSellerInfo eachSellerInfo in allSellerInfoList)
                {
                    //(1) calc highest price
                    if (eachSellerInfo.price > productInfo.highestPrice)
                    {
                        productInfo.highestPrice = eachSellerInfo.price;
                    }

                    //(2) find whether one of the sellers is Amazon
                    //here means: one of the seller's name is: Amazon.com
                    if (eachSellerInfo.name.Equals("Amazon.com", StringComparison.CurrentCultureIgnoreCase))
                    {
                        productInfo.isOneSellerIsAmazon = true;
                    }
                }
            }
            else
            {
                gLogger.Debug("not found seller info for " + usedAndNewUrl);
            }
            gLogger.Info("Highest Price=" + productInfo.highestPrice);
            gLogger.Info("One of Seller is Amazon=" + productInfo.isOneSellerIsAmazon);

            //5. 3 keyword Field
            productInfo.keywordFieldArr = amazonLib.extractProductKeywordField(productInfo.title, productInfo.keywordFieldArr.Length, rule_maxSingleKeywordFieldLen);
            gLogger.Info("Keyword Field List:");
            if ((productInfo.keywordFieldArr != null) && (productInfo.keywordFieldArr.Length > 0))
            {
                for (int idx = 0; idx < productInfo.keywordFieldArr.Length; idx++)
                {
                    String keywordField  = productInfo.keywordFieldArr[idx];
                    gLogger.Info(String.Format("[{0}]={1}", idx, keywordField));
                }
            }
            
            //6. product review
            productInfo.reviewNumber = amazonLib.extractProductReviewNumber(productHtml);
            gLogger.Info("ReviewNumber=" + productInfo.reviewNumber);

            //7. product best seller rank number list
            List<crifanLibAmazon.productBestRank> bestSellerRankList = amazonLib.extractProductBestSellerRankList(productHtml);
            if ((bestSellerRankList != null) && (bestSellerRankList.Count > 0))
            {
                productInfo.isBestSeller = true;
            }
            else
            {
                gLogger.Debug("bestSellerRankList is null or count not > 0 : " + bestSellerRankList.ToString());
            }
            gLogger.Info("Is BestSeller=" + productInfo.isBestSeller);

            return productInfo;
        }

        /// <summary>
        /// Create a new output Excel workbook that contains only the header row
        /// (Title, Description, Bullet1..5, ImageFilename1..5, HighestPrice,
        /// OneSellerIsAmazon, ReviewNumber, IsBestSeller).
        /// An already existing file with the same name is deleted first.
        /// </summary>
        /// <param name="excelFullFilename">Full path of the workbook to create.</param>
        private void createOutputFile(string excelFullFilename)
        {
            gLogger.Info("Creating ouput file " + excelFullFilename);

            bool isAutoFit = true;
            bool isHeaderBold = true;

            //init
            //if exist remove it
            if (File.Exists(excelFullFilename))
            {
                File.Delete(excelFullFilename);
            }

            object misValue = System.Reflection.Missing.Value;

            //only ONE Excel instance is started here:
            //creating a second one (and overwriting the reference) would leave the first instance running forever
            Excel.Application xlApp = null;
            Excel.Workbook xlWorkBook = null;
            Excel.Worksheet xlWorkSheet = null;
            bool isWorkBookClosed = false;

            try
            {
                xlApp = new Excel.Application();
                xlWorkBook = xlApp.Workbooks.Add(misValue);
                xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1);

                const int excelRowHeader = 1;
                const int excelColumnHeader = 1;

                //save header
                int curColumnIdx = 0 + excelColumnHeader;
                int rowIdx = 0 + excelRowHeader;

                xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "Title";
                xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "Description";

                const int constBullerLen = 5;
                for (int bulletIdx = 0; bulletIdx < constBullerLen; bulletIdx++)
                {
                    int bulletNum = bulletIdx + 1;
                    xlWorkSheet.Cells[rowIdx, curColumnIdx + bulletIdx] = "Bullet" + bulletNum.ToString();
                }
                curColumnIdx = curColumnIdx + constBullerLen;

                const int constImgNameListLen = 5;
                for (int imgIdx = 0; imgIdx < constImgNameListLen; imgIdx++)
                {
                    int imgNum = imgIdx + 1;
                    xlWorkSheet.Cells[rowIdx, curColumnIdx + imgIdx] = "ImageFilename" + imgNum.ToString();
                }
                curColumnIdx = curColumnIdx + constImgNameListLen;

                xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "HighestPrice";
                xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "OneSellerIsAmazon";
                xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "ReviewNumber";
                xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "IsBestSeller";

                //formatting
                //(1) header to bold
                if (isHeaderBold)
                {
                    Range headerRow = xlWorkSheet.get_Range("1:1", System.Type.Missing);
                    headerRow.Font.Bold = true;
                }
                //(2) auto adjust column width (according to content)
                if (isAutoFit)
                {
                    Range allColumn = xlWorkSheet.Columns;
                    allColumn.AutoFit();
                }

                //output
                xlWorkBook.SaveAs(excelFullFilename,
                                    XlFileFormat.xlWorkbookNormal,
                                    misValue,
                                    misValue,
                                    misValue,
                                    misValue,
                                    XlSaveAsAccessMode.xlExclusive,
                                    XlSaveConflictResolution.xlLocalSessionChanges,
                                    misValue,
                                    misValue,
                                    misValue,
                                    misValue);
                xlWorkBook.Close(true, misValue, misValue);
                isWorkBookClosed = true;
            }
            finally
            {
                //always shut down Excel and release the COM references, even when something above failed
                try
                {
                    if ((xlWorkBook != null) && !isWorkBookClosed)
                    {
                        //failure path: drop the half-written workbook instead of saving it
                        xlWorkBook.Close(false, misValue, misValue);
                    }

                    if (xlApp != null)
                    {
                        xlApp.Quit();
                    }
                }
                finally
                {
                    if (xlWorkSheet != null)
                    {
                        crl.releaseObject(xlWorkSheet);
                    }
                    if (xlWorkBook != null)
                    {
                        crl.releaseObject(xlWorkBook);
                    }
                    if (xlApp != null)
                    {
                        crl.releaseObject(xlApp);
                    }
                }
            }
        }

        /// <summary>
        /// Append one row with the given product info to the first sheet of an existing Excel workbook.
        /// The row is written directly below the sheet's currently used range, using the same
        /// column layout as the header written by createOutputFile:
        /// Title, Description, 5 bullet columns, 5 image filename columns,
        /// HighestPrice, OneSellerIsAmazon, ReviewNumber, IsBestSeller.
        /// </summary>
        /// <param name="fullFilename">Full path of the existing workbook.</param>
        /// <param name="productInfo">Product info to write as a new row.</param>
        private void appendInfoToFile(string fullFilename, AmazonProductInfo productInfo)
        {
            gLogger.Info("Saving product info for " + productInfo.url);

            object missingVal = System.Reflection.Missing.Value;

            Excel.Application xlApp = null;
            Excel.Workbook xlWorkBook = null;
            Excel.Worksheet xlWorkSheet = null;
            bool isWorkBookClosed = false;

            try
            {
                xlApp = new Microsoft.Office.Interop.Excel.Application();
                //xlApp.Visible = true;
                //xlApp.DisplayAlerts = false;

                //http://msdn.microsoft.com/zh-cn/library/microsoft.office.interop.excel.workbooks.open%28v=office.11%29.aspx
                xlWorkBook = xlApp.Workbooks.Open(
                    Filename : fullFilename,
                    //UpdateLinks:3,
                    ReadOnly : false,
                    //Format : 2, //use Commas as delimiter when open text file
                    //Password : missingVal,
                    //WriteResPassword : missingVal,
                    //IgnoreReadOnlyRecommended: false, //when save to readonly, will notice you
                    Origin: Excel.XlPlatform.xlWindows, //xlMacintosh/xlWindows/xlMSDOS
                    //Delimiter: ",",  // usefule when is text file
                    Editable : true,
                    Notify : false,
                    //Converter: missingVal, 
                    AddToMru: true, //True to add this workbook to the list of recently used files
                    Local: true,
                    CorruptLoad: missingVal //xlNormalLoad/xlRepairFile/xlExtractData
                    );

                //Get the first sheet
                xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1); //also can get by sheet name
                Excel.Range range = xlWorkSheet.UsedRange;
                int usedRowCount = range.Rows.Count;

                const int excelRowHeader = 1;
                const int excelColumnHeader = 1;

                int curColumnIdx = 0 + excelColumnHeader; //start from column begin
                int curRrowIdx = usedRowCount + excelRowHeader; // !!! here must added buildin excelRowHeader=1, otherwise will overwrite previous (added title or whole row value)

                xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.title;
                xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.description;

                //bullets: write at most constBullerLen values
                const int constBullerLen = 5;
                int bulletListLen = (productInfo.bulletArr == null) ? 0 : Math.Min(productInfo.bulletArr.Length, constBullerLen);
                for (int bulletIdx = 0; bulletIdx < bulletListLen; bulletIdx++)
                {
                    xlWorkSheet.Cells[curRrowIdx, curColumnIdx + bulletIdx] = productInfo.bulletArr[bulletIdx];
                }
                //always skip the full bullet column block, even if fewer values were written,
                //so that the following values stay below their own header column
                curColumnIdx = curColumnIdx + constBullerLen;

                //image filenames: write at most constImgNameListLen values
                const int constImgNameListLen = 5;
                int imgNameListLen = (productInfo.imgFullnameArr == null) ? 0 : Math.Min(productInfo.imgFullnameArr.Length, constImgNameListLen);
                for (int imgIdx = 0; imgIdx < imgNameListLen; imgIdx++)
                {
                    xlWorkSheet.Cells[curRrowIdx, curColumnIdx + imgIdx] = productInfo.imgFullnameArr[imgIdx];
                }
                //same as for bullets: keep the column layout fixed
                curColumnIdx = curColumnIdx + constImgNameListLen;

                xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.highestPrice;
                xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.isOneSellerIsAmazon;
                xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.reviewNumber;
                xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.isBestSeller;

                //use Save() instead of SaveAs():
                //SaveAs will popup a window ask you overwrite it or not, even if you have set the ConflictResolution to xlLocalSessionChanges
                xlWorkBook.Save();

                //http://msdn.microsoft.com/query/dev10.query?appId=Dev10IDEF1&l=ZH-CN&k=k%28MICROSOFT.OFFICE.INTEROP.EXCEL._WORKBOOK.CLOSE%29;k%28CLOSE%29;k%28TargetFrameworkMoniker-%22.NETFRAMEWORK%2cVERSION%3dV3.5%22%29;k%28DevLang-CSHARP%29&rd=true
                xlWorkBook.Close(SaveChanges : true);
                isWorkBookClosed = true;
            }
            finally
            {
                //always shut down Excel and release the COM references, even when something above failed
                try
                {
                    if ((xlWorkBook != null) && !isWorkBookClosed)
                    {
                        //failure path: do not persist a partially written row
                        xlWorkBook.Close(SaveChanges : false);
                    }

                    if (xlApp != null)
                    {
                        //quit the Excel instance started above, same as createOutputFile does
                        xlApp.Quit();
                    }
                }
                finally
                {
                    if (xlWorkSheet != null)
                    {
                        crl.releaseObject(xlWorkSheet);
                    }
                    if (xlWorkBook != null)
                    {
                        crl.releaseObject(xlWorkBook);
                    }
                    if (xlApp != null)
                    {
                        crl.releaseObject(xlApp);
                    }
                }
            }
        }

        //save product info
        /// <summary>
        /// Write one product's info to the output Excel file.
        /// The file is created first (via createOutputFile) when it does not exist yet,
        /// then the info is appended to it.
        /// </summary>
        /// <param name="productInfo">Product info to store.</param>
        private void saveProductInfo(AmazonProductInfo productInfo)
        {
            bool outputFileExists = File.Exists(outputExcelFullFilename);
            if (outputFileExists == false)
            {
                //first product to save -> no output file yet
                createOutputFile(outputExcelFullFilename);
            }

            appendInfoToFile(outputExcelFullFilename, productInfo);
        }

        //check whether each product valid or not
        //if valid, extract product info
        //http://www.amazon.com/Silver-Linings-Playbook/dp/B00CL68QVQ/ref=sr_1_2?s=instant-video&ie=UTF8&qid=1368688342&sr=1-2
        /// <summary>
        /// Download the page of a single product, validate it and,
        /// when it is valid, extract its info and save it.
        /// Invalid products are only logged together with the reason.
        /// </summary>
        /// <param name="productUrl">Url of the product page.</param>
        private void checkAndExtractForSingleProduct(string productUrl)
        {
            //for debug, a fixed url can be forced here, e.g.:
            //productUrl = "http://www.amazon.com/Paderno-World-Cuisine-A4982799-Tri-Blade/dp/B0007Y9WHQ/ref=lp_1055398_1_3?ie=UTF8&qid=1370596558&sr=1-3";

            string pageHtml = crl.getUrlRespHtml_multiTry(productUrl);

            string whyInvalid = "";
            string sellersUrl = "";
            bool productIsValid = checkProductValid(productUrl, pageHtml, out whyInvalid, out sellersUrl);

            if (!productIsValid)
            {
                gLogger.Info(String.Format("-INVALID- product={0}, reason={1}", productUrl, whyInvalid));
                return;
            }

            gLogger.Info("+VALID+ Product=" + productUrl);

            AmazonProductInfo extractedInfo = extractProductInfo(productUrl, pageHtml, sellersUrl);
            saveProductInfo(extractedInfo);
        }

        //check whether each product variation valid or not
        //if valid, extract product info
        /// <summary>
        /// Download the page of a single product variation, validate it and,
        /// when it is valid, extract its info and save it.
        /// The variation label is appended to the title unless the title already ends with it.
        /// </summary>
        /// <param name="singleVariationItem">Variation to process; its url and label are used.</param>
        private void checkAndExtractForSingleVariation(crifanLibAmazon.variationItem singleVariationItem)
        {
            gLogger.Info("processing variation " + singleVariationItem.url);

            string pageHtml = crl.getUrlRespHtml_multiTry(singleVariationItem.url);

            string whyInvalid = "";
            string sellersUrl = "";
            bool productIsValid = checkProductValid(singleVariationItem.url, pageHtml, out whyInvalid, out sellersUrl);

            if (!productIsValid)
            {
                gLogger.Info(String.Format("Invalid product={0}, reason={1}",singleVariationItem.url, whyInvalid));
                return;
            }

            gLogger.Info("Valid product=" + singleVariationItem.url);

            AmazonProductInfo extractedInfo = extractProductInfo(singleVariationItem.url, pageHtml, sellersUrl);

            //some titles already carry the variation label at the end, e.g.:
            //http://www.amazon.com/GE-MWF-Refrigerator-Filter-1-Pack/dp/B000AST3AK/ref=lp_1055398_1_4?ie=UTF8&qid=1370574186&sr=1-4
            //  -> GE MWF Refrigerator Water Filter, 1-Pack
            //http://www.amazon.com/gp/product/B003BIG0DO/ref=twister_B000AST3AK?ie=UTF8&psc=1
            //  -> GE SmartWater MWF Refrigerator Water Filter, 2-Pack
            //others share one title between all variations, e.g.:
            //http://www.amazon.com/Thermos-Insulated-18-Ounce-Stainless-Steel-Hydration/dp/B000FJ9DOK/ref=lp_1055398_1_6?ie=UTF8&qid=1370574186&sr=1-6
            //http://www.amazon.com/gp/product/B0057FQCNC/ref=twister_B000FJ9DOK?ie=UTF8&psc=1
            //  -> for those, add the label so the saved rows can be told apart
            bool titleAlreadyHasLabel = extractedInfo.title.EndsWith(singleVariationItem.label);
            if (!titleAlreadyHasLabel)
            {
                extractedInfo.title = extractedInfo.title + ", " + singleVariationItem.label;
            }

            saveProductInfo(extractedInfo);
        }

        /// <summary>
        /// Process all products found in the html of one search result page.
        /// A product with variations is processed once per variation,
        /// a product without variations is processed directly.
        /// </summary>
        /// <param name="singlePageHtml">Html of a single search result page.</param>
        private void processSinglePageHtml(string singlePageHtml)
        {
            List<crifanLibAmazon.searchResultItem> foundItemList = new List<crifanLibAmazon.searchResultItem>();
            bool gotItems = amazonLib.extractSearchItemList(singlePageHtml, out foundItemList);
            if (!gotItems)
            {
                //nothing extracted from this page
                return;
            }

            foreach (crifanLibAmazon.searchResultItem foundItem in foundItemList)
            {
                crifanLibAmazon.productVariationInfo foundVariationInfo = new crifanLibAmazon.productVariationInfo();
                gLogger.Info("processing single product url " + foundItem.productUrl);

                bool hasVariation = amazonLib.checkVariation(foundItem.productUrl, out foundVariationInfo);
                if (!hasVariation)
                {
                    //only the current single product -> process it directly
                    gLogger.Info("no variation for " + foundItem.productUrl);
                    checkAndExtractForSingleProduct(foundItem.productUrl);
                    continue;
                }

                //several variations -> process each of them
                List<crifanLibAmazon.variationItem> allVariations = foundVariationInfo.variationList;
                gLogger.Info(String.Format("Total {0} variations for {1}", allVariations.Count, foundItem.productUrl));

                foreach (crifanLibAmazon.variationItem oneVariation in allVariations)
                {
                    checkAndExtractForSingleVariation(oneVariation);
                }
            }
        }

        /// <summary>
        /// Process a search category page by page: download the current page,
        /// process all products on it, then continue with the next page url
        /// extracted from that page, until no next page is found.
        /// </summary>
        /// <param name="curPageSearchUrl">Url of the first search result page of the category.</param>
        private void processEachSearchCategory(string curPageSearchUrl)
        {
            gLogger.Info("processing search category " + curPageSearchUrl);

            while (true)
            {
                //fisrt:
                //http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dinstant-video
                //then:
                //http://www.amazon.com/s/ref=sr_pg_2?rh=n%3A2625373011%2Cn%3A%212644981011%2Cn%3A%212644982011%2Cn%3A2858778011&page=2&ie=UTF8&qid=1368697688

                string eachPageHtml = crl.getUrlRespHtml_multiTry(curPageSearchUrl);
                processSinglePageHtml(eachPageHtml);

                string nextPageUrl = "";
                if (!amazonLib.extractNextPageUrl(eachPageHtml, out nextPageUrl))
                {
                    //something wrong
                    break;
                }

                if (String.IsNullOrEmpty(nextPageUrl))
                {
                    //no more page
                    break;
                }

                if (nextPageUrl == curPageSearchUrl)
                {
                    //next page points back to the current one -> stop, otherwise would loop forever
                    break;
                }

                //move on to the next page
                //(without this the same page would be downloaded and processed again and again)
                //NOTE(review): assumes extractNextPageUrl returns a url that can be requested directly - confirm
                curPageSearchUrl = nextPageUrl;
            }
        }

        //find matched best seller category for input main category item
        /// <summary>
        /// Look up the best seller category that corresponds to a main category,
        /// using the static mapping gMainCatMappingBestSellerCatDict
        /// (main category key -> best seller category key).
        /// </summary>
        /// <param name="mainCateoryItem">Main category whose Key is looked up in the mapping.</param>
        /// <param name="bestSellerCateoryItem">
        /// The matched item from bestSellerCategoryList; a newly constructed item when nothing matched.
        /// </param>
        /// <returns>true when a matching best seller category was found, otherwise false.</returns>
        public bool findMatchedBestSellerCategoryItem(crifanLibAmazon.categoryItem mainCateoryItem, out crifanLibAmazon.categoryItem bestSellerCateoryItem)
        {
            bestSellerCateoryItem = new crifanLibAmazon.categoryItem();

            //static mapping is the only active lookup method.
            //a disabled alternative matched main and best seller categories dynamically by identical key,
            //which would additionally need special cases (Main Category : Best Seller) like
            //mobile-apps : mobile, arts-crafts : arts, baby-products : baby, stripbooks : books, mobile : wireless, ...
            bool hasMapping = (gMainCatMappingBestSellerCatDict != null) && (gMainCatMappingBestSellerCatDict.Count > 0);
            if (!hasMapping)
            {
                return false;
            }

            if (!gMainCatMappingBestSellerCatDict.ContainsKey(mainCateoryItem.Key))
            {
                return false;
            }

            string mappedBestSellerKey = gMainCatMappingBestSellerCatDict[mainCateoryItem.Key];

            foreach (crifanLibAmazon.categoryItem candidateItem in bestSellerCategoryList)
            {
                bool isSameKey = mappedBestSellerKey.Equals(candidateItem.Key, StringComparison.CurrentCultureIgnoreCase);
                if (isSameKey)
                {
                    //first match wins
                    bestSellerCateoryItem = candidateItem;
                    return true;
                }
            }

            return false;
        }


        /// <summary>
        /// Process one category: first its own search pages,
        /// then the pages of the corresponding best seller category, if one is found.
        /// </summary>
        /// <param name="singleCateoryItem">Category to process; its Key and Url are used.</param>
        private void searchSingleCategory(crifanLibAmazon.categoryItem singleCateoryItem)
        {
            //example key: instant-video
            string categoryKey = singleCateoryItem.Key;

            //1. general category url, example:
            //http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dinstant-video
            //NOTE(review): the generated url is never used, the item's own Url is processed instead - confirm which one is intended
            string generatedMainCategoryUrl = amazonLib.generateMainCategoryUrlFromCategoryKey(categoryKey);
            processEachSearchCategory(singleCateoryItem.Url);

            //2. Best Sellers
            crifanLibAmazon.categoryItem matchedBestSellerItem;
            bool hasBestSellerCategory = findMatchedBestSellerCategoryItem(singleCateoryItem, out matchedBestSellerItem);
            if (!hasBestSellerCategory)
            {
                gLogger.Info("NOT found corrsponding best seller item category url, for: " + singleCateoryItem.Url);
            }
            else
            {
                gLogger.Info("Found corrsponding best seller item category url=" + matchedBestSellerItem.Url);
                processEachSearchCategory(matchedBestSellerItem.Url);
            }

            //not implemented (yet):
            //3. Movers & Shakers -> process its category url, if the key is in a movers & shakers category list
            //4. Top Rated        -> process its category url, if the key is in a top rated category list
        }

        /// <summary>
        /// Click handler of the search button:
        /// processes the category currently selected in cmbSearchCategory.
        /// Does nothing when no category is selected.
        /// </summary>
        /// <param name="sender">Event sender (not used).</param>
        /// <param name="e">Event arguments (not used).</param>
        private void btnSearch_Click(object sender, EventArgs e)
        {
            //related entry pages:
            //http://www.amazon.com/Best-Sellers/zgbs/ref=zg_bs_tab
            //http://www.amazon.com/gp/movers-and-shakers/ref=zg_bs_tab
            //http://www.amazon.com/gp/top-rated/ref=zg_bs_tab

            crifanLibAmazon.categoryItem chosenCategory = new crifanLibAmazon.categoryItem();

            bool nothingSelected = (cmbSearchCategory.SelectedIndex < 0);
            if (nothingSelected)
            {
                return;
            }

            //has selected some sub category
            chosenCategory = (crifanLibAmazon.categoryItem)cmbSearchCategory.SelectedItem;
            searchSingleCategory(chosenCategory);
        }

    }
}

 

 

【总结】

转载请注明:在路上 » 【代码分享】C#代码:ScrapeAmazonProduct – 抓取Amazon产品数据(完全从网页中抓取)

发表我的评论
取消评论

表情

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址

网友最新评论 (1)

  1. 可能时间久远了,出现下面的错误了。感谢分享啊 [2015-10-21 18:00:39][FATAL] can not find main category list
    berrylover9年前 (2015-10-21)回复
91 queries in 0.196 seconds, using 20.66MB memory