【背景】
之前写了个C#程序,从Amazon中抓取数据。
此版本是完全从网页中抓取产品信息的。
【ScrapeAmazonProduct代码分享】
1.截图:
2.完整项目代码下载:
ScrapeAmazonProduct_2013-06-11_scrapeFromHtml.zip
3.代码分享:
(1)frmScrapeAmazonProduct.cs
/*
 * [File]
 * frmScrapeAmazonProduct.cs
 *
 * [Function]
 * Scrape products data from Amazon
 *
 * [Author]
 * Crifan Li
 *
 * [Date]
 * 2013-06-11
 *
 * [Contact]
 * https://www.crifan.org/contact_me/
 */

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Web;
using System.Net;
using System.Xml;
using System.IO;
using HtmlAgilityPack;
using System.Text.RegularExpressions;
using Excel = Microsoft.Office.Interop.Excel;
using Microsoft.Office.Interop.Excel;
using NLog;
using NLog.Targets;
using NLog.Config;

namespace ScrapeAmazonProduct
{
    public partial class frmScrapeAmazonProduct : Form
    {
        /// <summary>
        /// All data collected for one product; one instance becomes one row of the output Excel file.
        /// </summary>
        struct AmazonProductInfo
        {
            public string url; //record who it is

            public string title;
            public string description;

            //5 bullets
            public string[] bulletArr; // total 5 (or more, but only record 5)

            //download 5 pics
            public string[] imgFullnameArr; // total 5 (or more, but only record 5)

            //product keyword fields, up to 3
            public string[] keywordFieldArr; //each field, less than 50 chars, separated by ','

            //highest price of total (up to 8) sellers
            public float highestPrice;

            public bool isOneSellerIsAmazon;
            public int reviewNumber;
            public bool isBestSeller;
        };

        //for debug
        private int lineNumber = 1;

        string outputExcelFilename = "AmazonProductInfo.xls";
        string constOutputFolderName = "output";
        string outputExcelFullFilename = "";
        string absOutputFolder = "";
        string gLogFilename;

        public static string constAmazonDomainUrl = "http://www.amazon.com";

        // Product filter rules
        public static int rule_minimalBuyerNumber = 8;
        public static int rule_totalUnitNumber = 50;
        //check max length for each bullet < 100 (or 90?)
        public static int rule_maxLenEachBullet = 100;
        public static float rule_dimensionMaxLengthCm = 80.0F;
        public static float rule_dimensionMaxWidthCm = 80.0F;
        public static float rule_dimensionMaxHeightCm = 80.0F;
        public static int rule_maxSingleKeywordFieldLen = 50;

        Dictionary<string, string> gMainCatMappingBestSellerCatDict;

        public crifanLib crl;
        public crifanLibAmazon amazonLib;

        List<crifanLibAmazon.categoryItem> mainCategoryList;
        List<crifanLibAmazon.categoryItem> bestSellerCategoryList;

        //for log
        public Logger gLogger = null;

        /// <summary>
        /// Registers the embedded-DLL resolver, creates the helper libraries and builds the form controls.
        /// </summary>
        public frmScrapeAmazonProduct()
        {
            //!!! for load embedded dll: (1) register resolve handler
            // Registered before anything else so the helper libraries below can be resolved.
            AppDomain.CurrentDomain.AssemblyResolve += new ResolveEventHandler(CurrentDomain_AssemblyResolve);

            crl = new crifanLib();
            amazonLib = new crifanLibAmazon();

            gMainCatMappingBestSellerCatDict = null;

            InitializeComponent();
        }

        /// <summary>
        /// Resolves an assembly from the byte[] resources embedded in Properties.Resources.
        /// </summary>
        /// <returns>
        /// The loaded assembly, or null when the requested assembly is a satellite "resources"
        /// assembly or is not embedded, so that the runtime continues its normal resolution.
        /// </returns>
        //!!! for load embedded dll: (2) implement this handler
        System.Reflection.Assembly CurrentDomain_AssemblyResolve(object sender, ResolveEventArgs args)
        {
            // args.Name is either a full display name ("Foo.Bar, Version=...") or a plain file name.
            string dllName = args.Name.Contains(",")
                ? args.Name.Substring(0, args.Name.IndexOf(','))
                : args.Name.Replace(".dll", "");

            // Resource keys use '_' where the assembly name has '.'.
            dllName = dllName.Replace(".", "_");

            if (dllName.EndsWith("_resources"))
            {
                return null;
            }

            System.Resources.ResourceManager rm = new System.Resources.ResourceManager(
                GetType().Namespace + ".Properties.Resources",
                System.Reflection.Assembly.GetExecutingAssembly());

            // GetObject returns null for an unknown key; Assembly.Load(null) would throw from
            // inside the resolve handler, so report "not resolved" instead.
            byte[] bytes = rm.GetObject(dllName) as byte[];
            if (bytes == null)
            {
                return null;
            }

            return System.Reflection.Assembly.Load(bytes);
        }

        /// <summary>
        /// Fetches the main category list (bound to the search-category combo box) and the
        /// best seller category list from Amazon.
        /// </summary>
        private void initSearchCategory()
        {
            //http://www.amazon.com/ref=nb_sb_noss_null
            string regularCategoryMainUrl = "http://www.amazon.com/ref=nb_sb_noss_null";
            mainCategoryList = amazonLib.extractMainCategoryList(regularCategoryMainUrl);
            if ((mainCategoryList != null) && (mainCategoryList.Count > 0))
            {
                //init search category
                cmbSearchCategory.DataSource = mainCategoryList;
                cmbSearchCategory.DisplayMember = "name";
            }
            else
            {
                gLogger.Fatal("can not find main category list");
            }

            //alternative url: http://www.amazon.com/Best-Sellers/zgbs/ref=zg_bs_tab
            string bestSellerMainUrl = "http://www.amazon.com/Best-Sellers/zgbs";
            bestSellerCategoryList = amazonLib.extractBestSellerCategoryList(bestSellerMainUrl);
            if ((bestSellerCategoryList == null) || (bestSellerCategoryList.Count == 0))
            {
                gLogger.Warn("can not find best seller category list");
            }
        }

        /// <summary>
        /// Builds the lookup from main (search) category key to best seller category key.
        /// </summary>
        private void initMainCategoryToBestSellerCategoryMapping()
        {
            // Entries that were left commented out in the original (no counterpart recorded):
            //   main categories:        "instant-video", "collectibles", "financial"
            //   best seller categories: "photo", "kitchen"
            gMainCatMappingBestSellerCatDict = new Dictionary<string, string>
            {
                { "appliances", "appliances" },
                { "mobile-apps", "mobile" },
                { "arts-crafts", "arts" },
                { "automotive", "automotive" },
                { "baby-products", "baby" },
                { "beauty", "beauty" },
                { "stripbooks", "books" },
                { "mobile", "wireless" },
                { "apparel", "apparel" },
                { "computers", "pc" },
                { "electronics", "electronics" },
                { "gift-cards", "gift" },
                { "grocery", "grocery" },
                { "hpc", "hpc" },
                { "garden", "home" },
                { "industrial", "industrial" },
                { "jewelry", "jewelry" },
                { "digital-text", "digital" },
                { "magazines", "magazines" },
                { "movies-tv", "movies" },
                { "digital-music", "dmusic" },   //MP3 Music
                { "popular", "music" },          //Music
                { "mi", "musical" },             //Musical Instruments
                { "office-products", "office" },
                { "lawngarden", "lawn" },
                { "pets", "pet" },
                { "shoes", "shoes" },
                { "software", "software" },
                { "sporting", "sporting" },
                { "tools", "hi" },               //Tools & Home Improvement -> Home Improvement
                { "toys-and-games", "toys" },
                { "videogames", "videogames" },
                { "watches", "watches" },
            };
        }

        /// <summary>
        /// Configures NLog in code: Info and above go to the rtbLog rich text box,
        /// Trace and above go to the log file named by gLogFilename.
        /// </summary>
        private void initLogger()
        {
            // Step 1. Create configuration object
            LoggingConfiguration logConfig = new LoggingConfiguration();

            // Step 2. Create targets and add them to the configuration
            RichTextBoxTarget rtbTarget = new RichTextBoxTarget();
            logConfig.AddTarget("richTextBox", rtbTarget);
            rtbTarget.FormName = "frmScrapeAmazonProduct"; // your winform class name
            rtbTarget.ControlName = "rtbLog"; // your RichTextBox control/variable name

            FileTarget fileTarget = new FileTarget();
            logConfig.AddTarget("logFile", fileTarget);

            // Step 3. Set target properties
            //https://github.com/nlog/nlog/wiki/Layout-renderers
            //https://github.com/nlog/nlog/wiki/Level-Layout-Renderer
            string commonLayout = "[${date:format=yyyy-MM-dd HH\\:mm\\:ss}][${pad:padding=5:inner=${level:uppercase=true}}] ${message}";
            rtbTarget.Layout = commonLayout;

            fileTarget.FileName = gLogFilename; // e.g. <output folder>\2013-06-11_153647_log.txt
            fileTarget.Layout = commonLayout;

            // Step 4. Define rules
            LoggingRule ruleRichTextBox = new LoggingRule("*", LogLevel.Info, rtbTarget);
            logConfig.LoggingRules.Add(ruleRichTextBox);

            LoggingRule ruleFile = new LoggingRule("*", LogLevel.Trace, fileTarget);
            logConfig.LoggingRules.Add(ruleFile);

            // Step 5. Activate the configuration
            LogManager.Configuration = logConfig;

            gLogger = LogManager.GetLogger("");
        }

        /// <summary>
        /// Form load handler: prepares the output folder, the log file name, the logger,
        /// the category mapping and the category lists, in that order.
        /// </summary>
        private void frmScrapeAmazonProduct_Load(object sender, EventArgs e)
        {
            //1. init output directory
            absOutputFolder = Path.Combine(Environment.CurrentDirectory, constOutputFolderName);
            if (!Directory.Exists(absOutputFolder))
            {
                Directory.CreateDirectory(absOutputFolder);
            }
            outputExcelFullFilename = Path.Combine(absOutputFolder, outputExcelFilename);

            //2. init log filename
            DateTime curDateTime = DateTime.Now;
            string curDatetimeStr = String.Format("{0:yyyy-MM-dd_HHmmss}", curDateTime); //"2013-06-11_142102"
            gLogFilename = curDatetimeStr + "_log.txt"; //"2013-06-11_153647_log.txt"
            gLogFilename = Path.Combine(absOutputFolder, gLogFilename);

            //3. init logger (needs gLogFilename)
            initLogger();

            //4. init main category list to best seller mapping
            initMainCategoryToBestSellerCategoryMapping();

            //5. init main category list (logs through gLogger, so must come after step 3)
            initSearchCategory();
        }

        /// <summary>
        /// Checks that the product has more than rule_minimalBuyerNumber buyers.
        /// </summary>
        /// <param name="productHtml">Html of the product page.</param>
        /// <param name="invalidReason">Empty when valid, otherwise why the check failed.</param>
        /// <param name="usedAndNewUrl">The "used and new" offer listing url extracted from the page.</param>
        /// <returns>True when the buyer number is strictly greater than the rule.</returns>
        private bool checkBuyerNumber(string productHtml, out string invalidReason, out string usedAndNewUrl)
        {
            invalidReason = "Unknow error for checkBuyerNumber";
            usedAndNewUrl = "";

            int buyerNumber = 0;
            if (!amazonLib.extractProductBuyerNumberAndNewUrl(productHtml, out buyerNumber, out usedAndNewUrl))
            {
                invalidReason = "Not found buyer number string and used and new url";
                return false;
            }

            if (buyerNumber <= rule_minimalBuyerNumber)
            {
                // "not more than": a count equal to the rule is rejected as well
                invalidReason = String.Format("Buyer Number is {0}, not more than {1}", buyerNumber, rule_minimalBuyerNumber);
                return false;
            }

            invalidReason = "";
            return true;
        }

        /// <summary>
        /// Checks that the sellers on the offer listing page hold more than rule_totalUnitNumber
        /// units in total. For each offer the add-to-cart form is POSTed and the
        /// "Only N left in stock" text of the resulting page is summed up.
        /// </summary>
        /// <param name="productUrl">
        /// Offer listing url, e.g.
        /// http://www.amazon.com/gp/offer-listing/B0083PWAPW/ref=dp_olp_all_mbc?ie=UTF8&amp;condition=all
        /// </param>
        /// <param name="invalidReason">"No error" when valid, otherwise why the check failed.</param>
        /// <returns>True as soon as the running total exceeds rule_totalUnitNumber.</returns>
        private bool checkTotalUnitNumber(string productUrl, out string invalidReason)
        {
            bool isTotal50UnitNum = false;
            bool foundSpecificError = false;
            invalidReason = "Unknow error for checkTotalUnitNumber";
            int totalNumber = 0;

            string respHtml = crl.getUrlRespHtml_multiTry(productUrl);
            HtmlAgilityPack.HtmlDocument htmlDoc = crl.htmlToHtmlDoc(respHtml);

            // Each offer is a form like:
            //   <form method="POST" action="/gp/item-dispatch/ref=olp_atc_used_1">
            //     <input type="hidden" name="session-id" value="178-3505985-4680803">
            //     <input type="hidden" name="offeringID.1" value="...">
            //     ...
            //     <input type="image" ... name="submit.addToCart" .../>
            //   </form>
            HtmlNodeCollection postItemNodeList = htmlDoc.DocumentNode.SelectNodes(
                "//form[starts-with(@action, '/gp/item-dispatch/ref=') and @method='POST']");
            if (postItemNodeList == null)
            {
                invalidReason = "Can not found /gp/item-dispatch post item";
                return false;
            }

            foreach (HtmlNode postItemNode in postItemNodeList)
            {
                // "/gp/item-dispatch/ref=olp_atc_used_1" -> "http://www.amazon.com/gp/item-dispatch/ref=olp_atc_used_1"
                string itemDispatchUrl = constAmazonDomainUrl + postItemNode.Attributes["action"].Value;

                HtmlNodeCollection inputTypeNodeList = postItemNode.SelectNodes(".//input[@type='hidden' and @name and @value]");
                if (inputTypeNodeList == null)
                {
                    // The original used "{1}" with a single argument, which throws FormatException.
                    invalidReason = String.Format("Can not find input tag for node: {0}", postItemNode.InnerHtml);
                    foundSpecificError = true;
                    break;
                }

                // Resulting POST body looks like:
                //   session-id=...&qid=&sr=&signInToHUC=0&metric-asin.1616550414=1
                //   &registryItemID.1=&registryID.1=&itemCount=1&offeringID.1=...&isAddon=0
                //   &submit.addToCart.x=63&submit.addToCart.y=7
                Dictionary<string, string> postDict = new Dictionary<string, string>();
                foreach (HtmlNode inputTypeNode in inputTypeNodeList)
                {
                    // Indexer instead of Add: a repeated hidden field must not abort the whole check.
                    postDict[inputTypeNode.Attributes["name"].Value] = inputTypeNode.Attributes["value"].Value;
                }
                // Click position of the image submit button.
                postDict["submit.addToCart.x"] = "63";
                postDict["submit.addToCart.y"] = "7";

                Dictionary<string, string> headerDict = new Dictionary<string, string>();
                headerDict.Add("AllowAutoRedirect", "false"); // the redirect target is read from the Location header below
                headerDict.Add("Referer", productUrl);

                //do POST, no auto redirect
                string viewHtmlUrl = null;
                HttpWebResponse resp = crl.getUrlResponse(itemDispatchUrl, headerDict, postDict);
                if (resp != null)
                {
                    try
                    {
                        viewHtmlUrl = resp.Headers["Location"];
                    }
                    finally
                    {
                        // Only the header is needed; close the response so its connection is released.
                        resp.Close();
                    }
                }

                if (String.IsNullOrEmpty(viewHtmlUrl))
                {
                    invalidReason = "Not found viewHtmlUrl";
                    foundSpecificError = true;
                    break;
                }

                respHtml = crl.getUrlRespHtml_multiTry(viewHtmlUrl);
                htmlDoc = crl.htmlToHtmlDoc(respHtml);

                //<div class="hlb-scarcity red">Only 8 left in stock.</div>
                HtmlNode hlbScarcityNode = htmlDoc.DocumentNode.SelectSingleNode("//div[starts-with(@class, 'hlb-scarcity')]");
                if (hlbScarcityNode == null)
                {
                    //first one is amazon, no hlb-scarcity red
                    //others must have this
                    continue;
                }

                string leftInStockStr = hlbScarcityNode.InnerText; //Only 1 left in stock.
                string leftNumberStr = "";
                int leftNumberInt;
                if (!crl.extractSingleStr(@"Only (\d+) left in stock", leftInStockStr, out leftNumberStr)
                    || !Int32.TryParse(leftNumberStr, out leftNumberInt))
                {
                    invalidReason = "Can not find remaining number";
                    foundSpecificError = true;
                    break;
                }

                totalNumber += leftNumberInt;
                if (totalNumber > rule_totalUnitNumber)
                {
                    isTotal50UnitNum = true;
                    invalidReason = "No error";
                    break;
                }
            }

            if (!isTotal50UnitNum && !foundSpecificError)
            {
                // All offers were processed without error but the stock is simply too small.
                invalidReason = String.Format("Total unit number is {0}, not more than {1}", totalNumber, rule_totalUnitNumber);
            }

            return isTotal50UnitNum;
        }

        /// <summary>
        /// Checks that the product weighs no more than 2.5 kg (treated as 5 pounds by the messages).
        /// </summary>
        /// <param name="productUrl">Product url (not used by the check itself).</param>
        /// <param name="productHtml">Html of the product page.</param>
        /// <param name="invalidReason">Empty when valid, otherwise why the check failed.</param>
        private bool checkWeight(string productUrl, string productHtml, out string invalidReason)
        {
            const float maxKiloGram = 2.5F;

            float kiloGram = amazonLib.extractProductWeight(productHtml);
            if (kiloGram <= 0.0F)
            {
                invalidReason = "Not found weight string or unrecognized weight number";
                return false;
            }

            if (kiloGram > maxKiloGram)
            {
                invalidReason = String.Format("Weight is {0} kilogram, more than 5 pounds", kiloGram);
                return false;
            }

            invalidReason = "";
            return true;
        }

        /// <summary>
        /// Checks that every dimension is within the rule_dimensionMax*Cm limits.
        /// A product without a recognised dimension (length not &gt; 0) is accepted.
        /// </summary>
        /// <param name="productUrl">Product url (not used by the check itself).</param>
        /// <param name="productHtml">Html of the product page.</param>
        /// <param name="invalidReason">Empty when valid, otherwise why the check failed.</param>
        private bool checkDimension(string productUrl, string productHtml, out string invalidReason)
        {
            invalidReason = "";

            crifanLibAmazon.productDimension dimensionCm = amazonLib.extractProductDimension(productHtml);
            if (dimensionCm.length <= 0.0F)
            {
                // even if no dimension, also consider it as valid one if the weight is valid
                return true;
            }

            bool isWithinLimits =
                (dimensionCm.length <= rule_dimensionMaxLengthCm) &&
                (dimensionCm.width <= rule_dimensionMaxWidthCm) &&
                (dimensionCm.height <= rule_dimensionMaxHeightCm);

            if (!isWithinLimits)
            {
                invalidReason = String.Format("Dimension: {0}cm x {1}cm x {2}cm invalid for exceed max: {3}cm x {4}cm x {5}cm",
                    dimensionCm.length, dimensionCm.width, dimensionCm.height,
                    rule_dimensionMaxLengthCm, rule_dimensionMaxWidthCm, rule_dimensionMaxHeightCm);
            }

            return isWithinLimits;
        }

        /// <summary>
        /// Runs all product rules in order and stops at the first one that fails.
        /// </summary>
        /// <param name="productUrl">Product url.</param>
        /// <param name="productHtml">Html of the product page.</param>
        /// <param name="invalidReason">Empty when valid, otherwise the reason of the failed rule.</param>
        /// <param name="usedAndNewUrl">Offer listing url found by the buyer number check.</param>
        private bool checkProductValid(string productUrl, string productHtml, out string invalidReason, out string usedAndNewUrl)
        {
            invalidReason = "";
            usedAndNewUrl = "";

            //1. check buyer number > 8
            if (!checkBuyerNumber(productHtml, out invalidReason, out usedAndNewUrl))
            {
                return false;
            }

            //2. check total unit number > 50
            if (!checkTotalUnitNumber(usedAndNewUrl, out invalidReason))
            {
                return false;
            }

            //3. check no more than 5 pounds (2.5 kg)
            if (!checkWeight(productUrl, productHtml, out invalidReason))
            {
                return false;
            }

            //4. check dimension less than 80cmX80cmX80cm
            if (!checkDimension(productUrl, productHtml, out invalidReason))
            {
                return false;
            }

            invalidReason = "";
            return true;
        }

        /// <summary>
        /// Download progress callback; intentionally does nothing at present.
        /// </summary>
        public void updateProgress(int percentage)
        {
            //pgbDownload.Value = percentage;
        }

        /// <summary>
        /// Downloads every product image found in the page into [output]\download\[ASIN]\.
        /// </summary>
        /// <param name="productUrl">Product url, used to extract the ASIN.</param>
        /// <param name="respHtml">Html of the product page.</param>
        /// <param name="picFullnameList">
        /// Full local file name for each image url, or null when the page has no image list.
        /// </param>
        public void downloadPictures(string productUrl, string respHtml, out string[] picFullnameList)
        {
            picFullnameList = null;

            string downloadRootPath = Path.Combine(absOutputFolder, "download");

            string productAsin = "";
            if (!amazonLib.extractAsinFromProductUrl(productUrl, out productAsin) || String.IsNullOrEmpty(productAsin))
            {
                // Path.Combine rejects null; with an empty name the pictures land in the download root.
                productAsin = "";
                gLogger.Warn(String.Format("Can not extract ASIN from {0}, pictures will be saved into {1}", productUrl, downloadRootPath));
            }

            //create folder
            string downloadFullPath = Path.Combine(downloadRootPath, productAsin);
            if (!Directory.Exists(downloadFullPath))
            {
                Directory.CreateDirectory(downloadFullPath);
            }

            string[] imageUrlList = amazonLib.extractProductImageList(respHtml);
            gLogger.Info("Extracted image url list:");
            if (imageUrlList == null)
            {
                gLogger.Error("No image url for " + productUrl);
                return;
            }

            picFullnameList = new string[imageUrlList.Length];
            for (int idx = 0; idx < imageUrlList.Length; idx++)
            {
                string imageUrl = imageUrlList[idx];
                gLogger.Info(String.Format("[{0}]={1}", idx, imageUrl));

                string picFilename = crl.extractFilenameFromUrl(imageUrl);
                string picFullFilename = Path.Combine(downloadFullPath, picFilename);

                string errorStr = "";
                gLogger.Info(String.Format("Downloading {0}] to {1}", imageUrl, picFullFilename));
                crl.downloadFile(imageUrl, picFullFilename, out errorStr, updateProgress);

                picFullnameList[idx] = picFullFilename;
            }
        }

        /// <summary>
        /// Collects title, description, bullets, pictures, seller info, keyword fields,
        /// review number and best seller flag for one (already validated) product.
        /// </summary>
        /// <param name="productUrl">
        /// e.g. http://www.amazon.com/Kindle-Paperwhite-Touch-light/dp/B007OZNZG0/ref=lp_1055398_1_1?ie=UTF8&amp;qid=1370510177&amp;sr=1-1
        /// </param>
        /// <param name="productHtml">Html of the product page.</param>
        /// <param name="usedAndNewUrl">
        /// e.g. http://www.amazon.com/gp/offer-listing/B007OZNZG0/ref=dp_olp_all_mbc?ie=UTF8&amp;condition=all
        /// </param>
        private AmazonProductInfo extractProductInfo(string productUrl, string productHtml, string usedAndNewUrl)
        {
            gLogger.Info("Extracting info for " + productUrl);

            //init
            AmazonProductInfo productInfo = new AmazonProductInfo();
            productInfo.url = productUrl;
            productInfo.highestPrice = 0.0F;
            productInfo.isOneSellerIsAmazon = false;
            productInfo.isBestSeller = false;

            //must init, otherwise, when only got 4 bullet, here total 5 -> last is null -> assign later will exception
            productInfo.bulletArr = new string[5];
            crl.emptyStringArray(productInfo.bulletArr);
            productInfo.imgFullnameArr = new string[5];
            crl.emptyStringArray(productInfo.imgFullnameArr);
            productInfo.keywordFieldArr = new string[3];
            crl.emptyStringArray(productInfo.keywordFieldArr);

            //1. title
            productInfo.title = amazonLib.extractProductTitle(productHtml);
            gLogger.Info("Title=" + productInfo.title);

            //2. description and 5 bullets
            List<string> bulletList = new List<string>();
            bool gotBullets = amazonLib.extractProductBulletList(productHtml, out bulletList);
            gLogger.Info("Extracted Bullets=" + gotBullets);
            if (bulletList == null)
            {
                bulletList = new List<string>();
            }

            string description = "";
            bool gotDescription = amazonLib.extractProductDescription(productHtml, out description);
            gLogger.Info("Got Description=" + gotDescription);
            if (description == null)
            {
                description = "";
            }

            /*
             * 1. if no description, use all bullets as description
             * 2. if more than 5 bullets, only the first 5 are recorded as bullets
             * 3. if no bullet, split the description on '.' and use the first 5 parts as bullets
             * 4. if neither exists, description stays unset and bullets stay empty
             */
            bool hasDescription = (description != "");
            bool hasBullets = (bulletList.Count > 0);
            int maxBulletCount = productInfo.bulletArr.Length;

            if (hasBullets)
            {
                //maybe has more than 5 bullets, maybe less than 5 bullets
                //a page can have feature-bullets_feature_div but no content, e.g.
                //http://www.amazon.com/AmazonBasics-Lightning-Compatible-Cable-inch/dp/B00B5RGAWY/ref=sr_1_3?s=wireless&ie=UTF8&qid=1369753764&sr=1-3
                int bulletCount = Math.Min(bulletList.Count, maxBulletCount);
                for (int idx = 0; idx < bulletCount; idx++)
                {
                    productInfo.bulletArr[idx] = bulletList[idx];
                }
            }

            if (hasDescription)
            {
                productInfo.description = description;

                if (!hasBullets)
                {
                    //separate description to many lines; maybe less than 5, maybe greater than 5
                    string[] lines = description.Split('.');
                    int lineCount = Math.Min(lines.Length, maxBulletCount);
                    for (int idx = 0; idx < lineCount; idx++)
                    {
                        productInfo.bulletArr[idx] = lines[idx];
                    }
                }
            }
            else if (hasBullets)
            {
                //all bullets -> description
                StringBuilder descriptionBuilder = new StringBuilder();
                foreach (string bulletStr in bulletList)
                {
                    descriptionBuilder.Append(bulletStr).Append(Environment.NewLine);
                }
                productInfo.description = descriptionBuilder.ToString();
            }

            //check max length for each bullet
            for (int idx = 0; idx < productInfo.bulletArr.Length; idx++)
            {
                string bullet = productInfo.bulletArr[idx];
                if ((bullet != null) && (bullet.Length > rule_maxLenEachBullet))
                {
                    productInfo.bulletArr[idx] = bullet.Substring(0, rule_maxLenEachBullet);
                }
            }

            //3. download pics, record the first 5 file names
            string[] picFullnameList = null;
            downloadPictures(productUrl, productHtml, out picFullnameList);
            if (picFullnameList != null)
            {
                int imageCount = Math.Min(picFullnameList.Length, productInfo.imgFullnameArr.Length);
                Array.Copy(picFullnameList, productInfo.imgFullnameArr, imageCount);
            }

            //4. extract product seller info: price and name
            List<crifanLibAmazon.productSellerInfo> allSellerInfoList = new List<crifanLibAmazon.productSellerInfo>();
            if (amazonLib.extractAllSellerInfo(usedAndNewUrl, out allSellerInfoList) && (allSellerInfoList != null))
            {
                foreach (crifanLibAmazon.productSellerInfo eachSellerInfo in allSellerInfoList)
                {
                    //(1) calc highest price
                    if (eachSellerInfo.price > productInfo.highestPrice)
                    {
                        productInfo.highestPrice = eachSellerInfo.price;
                    }

                    //(2) find whether one of the sellers is Amazon
                    //here means: one of the seller's name is: Amazon.com
                    //static String.Equals so that a seller without a name does not throw
                    if (String.Equals(eachSellerInfo.name, "Amazon.com", StringComparison.OrdinalIgnoreCase))
                    {
                        productInfo.isOneSellerIsAmazon = true;
                    }
                }
            }
            else
            {
                gLogger.Debug("not found seller info for " + usedAndNewUrl);
            }
            gLogger.Info("Highest Price=" + productInfo.highestPrice);
            gLogger.Info("One of Seller is Amazon=" + productInfo.isOneSellerIsAmazon);

            //5. 3 keyword fields
            productInfo.keywordFieldArr = amazonLib.extractProductKeywordField(productInfo.title, productInfo.keywordFieldArr.Length, rule_maxSingleKeywordFieldLen);
            gLogger.Info("Keyword Field List:");
            if (productInfo.keywordFieldArr != null)
            {
                for (int idx = 0; idx < productInfo.keywordFieldArr.Length; idx++)
                {
                    gLogger.Info(String.Format("[{0}]={1}", idx, productInfo.keywordFieldArr[idx]));
                }
            }

            //6. product review
            productInfo.reviewNumber = amazonLib.extractProductReviewNumber(productHtml);
            gLogger.Info("ReviewNumber=" + productInfo.reviewNumber);

            //7. product best seller rank number list
            List<crifanLibAmazon.productBestRank> bestSellerRankList = amazonLib.extractProductBestSellerRankList(productHtml);
            if ((bestSellerRankList != null) && (bestSellerRankList.Count > 0))
            {
                productInfo.isBestSeller = true;
            }
            else
            {
                // The list may be null here, so it must not be dereferenced for the log text.
                gLogger.Debug("bestSellerRankList is null or count not > 0 : "
                    + ((bestSellerRankList == null) ? "null" : bestSellerRankList.ToString()));
            }
            gLogger.Info("Is BestSeller=" + productInfo.isBestSeller);

            return productInfo;
        }

        /// <summary>
        /// Creates the output workbook (replacing an existing file) containing only the header row.
        /// </summary>
        /// <param name="excelFullFilename">Full path of the .xls file to create.</param>
        private void createOutputFile(string excelFullFilename)
        {
            gLogger.Info("Creating ouput file " + excelFullFilename);

            bool isAutoFit = true;
            bool isHeaderBold = true;

            //if exist remove it
            if (File.Exists(excelFullFilename))
            {
                File.Delete(excelFullFilename);
            }

            object misValue = System.Reflection.Missing.Value;

            Excel.Application xlApp = null;
            Excel.Workbook xlWorkBook = null;
            Excel.Worksheet xlWorkSheet = null;
            try
            {
                // Exactly one Excel instance: every instance is a separate EXCEL.EXE process
                // that stays alive until Quit() is called on it.
                xlApp = new Excel.Application();
                xlWorkBook = xlApp.Workbooks.Add(misValue);
                xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1);

                // Excel rows and columns are 1-based
                const int excelRowHeader = 1;
                const int excelColumnHeader = 1;

                //save header
                int curColumnIdx = 0 + excelColumnHeader;
                int rowIdx = 0 + excelRowHeader;

                xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "Title";
                xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "Description";

                const int constBullerLen = 5;
                for (int bulletIdx = 0; bulletIdx < constBullerLen; bulletIdx++)
                {
                    int bulletNum = bulletIdx + 1;
                    xlWorkSheet.Cells[rowIdx, curColumnIdx + bulletIdx] = "Bullet" + bulletNum.ToString();
                }
                curColumnIdx = curColumnIdx + constBullerLen;

                const int constImgNameListLen = 5;
                for (int imgIdx = 0; imgIdx < constImgNameListLen; imgIdx++)
                {
                    int imgNum = imgIdx + 1;
                    xlWorkSheet.Cells[rowIdx, curColumnIdx + imgIdx] = "ImageFilename" + imgNum.ToString();
                }
                curColumnIdx = curColumnIdx + constImgNameListLen;

                xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "HighestPrice";
                xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "OneSellerIsAmazon";
                xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "ReviewNumber";
                xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "IsBestSeller";

                //formatting
                //(1) header to bold
                if (isHeaderBold)
                {
                    Excel.Range headerRow = xlWorkSheet.get_Range("1:1", System.Type.Missing);
                    headerRow.Font.Bold = true;
                }

                //(2) auto adjust column width (according to content)
                if (isAutoFit)
                {
                    Excel.Range allColumn = xlWorkSheet.Columns;
                    allColumn.AutoFit();
                }

                //output
                xlWorkBook.SaveAs(excelFullFilename,
                    XlFileFormat.xlWorkbookNormal,
                    misValue, misValue, misValue, misValue,
                    XlSaveAsAccessMode.xlExclusive,
                    XlSaveConflictResolution.xlLocalSessionChanges,
                    misValue, misValue, misValue, misValue);
                xlWorkBook.Close(true, misValue, misValue);
            }
            finally
            {
                if (xlApp != null)
                {
                    // No "save changes?" prompt from the invisible instance if something failed above.
                    xlApp.DisplayAlerts = false;
                    xlApp.Quit();
                }

                if (xlWorkSheet != null) { crl.releaseObject(xlWorkSheet); }
                if (xlWorkBook != null) { crl.releaseObject(xlWorkBook); }
                if (xlApp != null) { crl.releaseObject(xlApp); }
            }
        }

        /// <summary>
        /// Appends one product as a new row below the used range of the first sheet.
        /// The column layout matches the header written by createOutputFile.
        /// </summary>
        /// <param name="fullFilename">Full path of the existing output workbook.</param>
        /// <param name="productInfo">Product to append.</param>
        private void appendInfoToFile(string fullFilename, AmazonProductInfo productInfo)
        {
            gLogger.Info("Saving product info for " + productInfo.url);

            object missingVal = System.Reflection.Missing.Value;

            Excel.Application xlApp = null;
            Excel.Workbook xlWorkBook = null;
            Excel.Worksheet xlWorkSheet = null;
            Excel.Range usedRange = null;
            try
            {
                xlApp = new Excel.Application();

                //http://msdn.microsoft.com/zh-cn/library/microsoft.office.interop.excel.workbooks.open%28v=office.11%29.aspx
                xlWorkBook = xlApp.Workbooks.Open(
                    Filename: fullFilename,
                    ReadOnly: false,
                    Origin: Excel.XlPlatform.xlWindows, //xlMacintosh/xlWindows/xlMSDOS
                    Editable: true,
                    Notify: false,
                    AddToMru: true, //True to add this workbook to the list of recently used files
                    Local: true,
                    CorruptLoad: missingVal //xlNormalLoad/xlRepairFile/xlExtractData
                );

                //Get the first sheet
                xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1); //also can get by sheet name
                usedRange = xlWorkSheet.UsedRange;
                int usedRowCount = usedRange.Rows.Count;

                const int excelRowHeader = 1;
                const int excelColumnHeader = 1;

                int curColumnIdx = 0 + excelColumnHeader; //start from column begin
                // !!! here must add the built-in excelRowHeader=1, otherwise will overwrite previous (added title or whole row value)
                int curRrowIdx = usedRowCount + excelRowHeader;

                xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.title;
                xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.description;

                const int constBullerLen = 5;
                int bulletListLen = Math.Min(productInfo.bulletArr.Length, constBullerLen);
                for (int bulletIdx = 0; bulletIdx < bulletListLen; bulletIdx++)
                {
                    xlWorkSheet.Cells[curRrowIdx, curColumnIdx + bulletIdx] = productInfo.bulletArr[bulletIdx];
                }
                // Always skip the full header width so the following columns stay aligned with the header.
                curColumnIdx = curColumnIdx + constBullerLen;

                const int constImgNameListLen = 5;
                int imgNameListLen = Math.Min(productInfo.imgFullnameArr.Length, constImgNameListLen);
                for (int imgIdx = 0; imgIdx < imgNameListLen; imgIdx++)
                {
                    xlWorkSheet.Cells[curRrowIdx, curColumnIdx + imgIdx] = productInfo.imgFullnameArr[imgIdx];
                }
                curColumnIdx = curColumnIdx + constImgNameListLen;

                xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.highestPrice;
                xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.isOneSellerIsAmazon;
                xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.reviewNumber;
                xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.isBestSeller;

                // Save() instead of SaveAs(): SaveAs pops up an "overwrite?" window even with
                // ConflictResolution set to xlLocalSessionChanges.
                xlWorkBook.Save();
                xlWorkBook.Close(SaveChanges: true);
            }
            finally
            {
                if (xlApp != null)
                {
                    // Without Quit() every appended product leaves an EXCEL.EXE process behind.
                    xlApp.DisplayAlerts = false;
                    xlApp.Quit();
                }

                if (usedRange != null) { crl.releaseObject(usedRange); }
                if (xlWorkSheet != null) { crl.releaseObject(xlWorkSheet); }
                if (xlWorkBook != null) { crl.releaseObject(xlWorkBook); }
                if (xlApp != null) { crl.releaseObject(xlApp); }
            }
        }

        /// <summary>
        /// Saves one product into the output Excel file, creating the file (with header) on first use.
        /// </summary>
        private void saveProductInfo(AmazonProductInfo productInfo)
        {
            //check if output excel file already exist
            if (!File.Exists(outputExcelFullFilename))
            {
                //if no, create it, add header
                createOutputFile(outputExcelFullFilename);
            }

            //then append info to it
            appendInfoToFile(outputExcelFullFilename, productInfo);
        }

        //check whether each product valid or not
        //if valid, extract product info
        //http://www.amazon.com/Silver-Linings-Playbook/dp/B00CL68QVQ/ref=sr_1_2?s=instant-video&ie=UTF8&qid=1368688342&sr=1-2
        private void checkAndExtractForSingleProduct(string productUrl)
        {
            //debug
            //productUrl = "http://www.amazon.com/Paderno-World-Cuisine-A4982799-Tri-Blade/dp/B0007Y9WHQ/ref=lp_1055398_1_3?ie=UTF8&qid=1370596558&sr=1-3";

            bool isProductValid = false;
            string invalidReason = "";
            //string respHtml = crl.getUrlRespHtml(productUrl);
            string productHtml = crl.getUrlRespHtml_multiTry(productUrl);
            string usedAndNewUrl = "";
            isProductValid
= checkProductValid(productUrl, productHtml, out invalidReason, out usedAndNewUrl); if (isProductValid) { gLogger.Info("+VALID+ Product=" + productUrl); AmazonProductInfo productInfo = extractProductInfo(productUrl, productHtml, usedAndNewUrl); saveProductInfo(productInfo); } else { gLogger.Info(String.Format("-INVALID- product={0}, reason={1}", productUrl, invalidReason)); } } //check whether each product variation valid or not //if valid, extract product info private void checkAndExtractForSingleVariation(crifanLibAmazon.variationItem singleVariationItem) { bool isProductValid = false; string invalidReason = ""; gLogger.Info("processing variation " + singleVariationItem.url); //string respHtml = crl.getUrlRespHtml(singleVariationItem.url); string productHtml = crl.getUrlRespHtml_multiTry(singleVariationItem.url); string usedAndNewUrl = ""; isProductValid = checkProductValid(singleVariationItem.url, productHtml, out invalidReason, out usedAndNewUrl); if (isProductValid) { gLogger.Info("Valid product=" + singleVariationItem.url); AmazonProductInfo productInfo = extractProductInfo(singleVariationItem.url, productHtml, usedAndNewUrl); //check whether the product title already have vartiation label in the end of title //if not, added it if (productInfo.title.EndsWith(singleVariationItem.label)) { //http://www.amazon.com/GE-MWF-Refrigerator-Filter-1-Pack/dp/B000AST3AK/ref=lp_1055398_1_4?ie=UTF8&qid=1370574186&sr=1-4 //title already added variation label: //GE MWF Refrigerator Water Filter, 1-Pack //also for: //http://www.amazon.com/gp/product/B003BIG0DO/ref=twister_B000AST3AK?ie=UTF8&psc=1 //GE SmartWater MWF Refrigerator Water Filter, 2-Pack } else { //http://www.amazon.com/Thermos-Insulated-18-Ounce-Stainless-Steel-Hydration/dp/B000FJ9DOK/ref=lp_1055398_1_6?ie=UTF8&qid=1370574186&sr=1-6 //and //http://www.amazon.com/gp/product/B0057FQCNC/ref=twister_B000FJ9DOK?ie=UTF8&psc=1 //has same title productInfo.title = productInfo.title + ", " + singleVariationItem.label; } 
saveProductInfo(productInfo); } else { gLogger.Info(String.Format("Invalid product={0}, reason={1}",singleVariationItem.url, invalidReason)); } } private void processSinglePageHtml(string singlePageHtml) { List<crifanLibAmazon.searchResultItem> searchedItemList = new List<crifanLibAmazon.searchResultItem>(); if (amazonLib.extractSearchItemList(singlePageHtml, out searchedItemList)) { foreach (crifanLibAmazon.searchResultItem eachSearchResultItem in searchedItemList) { crifanLibAmazon.productVariationInfo variationInfo = new crifanLibAmazon.productVariationInfo(); gLogger.Info("processing single product url " + eachSearchResultItem.productUrl); if (amazonLib.checkVariation(eachSearchResultItem.productUrl, out variationInfo)) { //have many varation //process each variation List<crifanLibAmazon.variationItem> variationList = variationInfo.variationList; gLogger.Info(String.Format("Total {0} variations for {1}", variationList.Count, eachSearchResultItem.productUrl)); foreach (crifanLibAmazon.variationItem eachVariationItem in variationList) { checkAndExtractForSingleVariation(eachVariationItem); } } else { //no variation -> only current single product //directly process this product gLogger.Info("no variation for " + eachSearchResultItem.productUrl); checkAndExtractForSingleProduct(eachSearchResultItem.productUrl); } } } } private void processEachSearchCategory(string curPageSearchUrl) { gLogger.Info("processing search category " + curPageSearchUrl); string eachPageHtml = ""; bool hasMorePage = true; //get each page html while (hasMorePage) { //fisrt: //http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dinstant-video //then: //http://www.amazon.com/s/ref=sr_pg_2?rh=n%3A2625373011%2Cn%3A%212644981011%2Cn%3A%212644982011%2Cn%3A2858778011&page=2&ie=UTF8&qid=1368697688 //eachPageHtml = crl.getUrlRespHtml(curPageSearchUrl); eachPageHtml = crl.getUrlRespHtml_multiTry(curPageSearchUrl); processSinglePageHtml(eachPageHtml); string nextPageUrl = ""; if 
(amazonLib.extractNextPageUrl(eachPageHtml, out nextPageUrl)) { if (nextPageUrl != "") { //http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dinstant-video#/ref=sr_pg_2?rh=n%3A2858778011&page=2&ie=UTF8&qid=1368688123 //http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3Dinstant-video#/ref=sr_pg_3?rh=n%3A2858778011&page=3&ie=UTF8&qid=1368688123 hasMorePage = true; } else { hasMorePage = false; break; } } else { //something wrong break; } } } //find matched best seller category for input main category item public bool findMatchedBestSellerCategoryItem(crifanLibAmazon.categoryItem mainCateoryItem, out crifanLibAmazon.categoryItem bestSellerCateoryItem) { bool foundMatchedBestSeller = false; bestSellerCateoryItem = new crifanLibAmazon.categoryItem(); //Method 1: static mapping if (gMainCatMappingBestSellerCatDict != null && (gMainCatMappingBestSellerCatDict.Count > 0)) { if (gMainCatMappingBestSellerCatDict.ContainsKey(mainCateoryItem.Key)) { string bestSellerCategoryKey = gMainCatMappingBestSellerCatDict[mainCateoryItem.Key]; foreach (crifanLibAmazon.categoryItem singleBestSellerCatItem in bestSellerCategoryList) { if (bestSellerCategoryKey.Equals(singleBestSellerCatItem.Key, StringComparison.CurrentCultureIgnoreCase)) { bestSellerCateoryItem = singleBestSellerCatItem; foundMatchedBestSeller = true; break; } } } } //Method 2: dynamic find same category key //bestSellerCateoryItem = new crifanLibAmazon.categoryItem(); //foreach (crifanLibAmazon.categoryItem singleBestSellerCatItem in bestSellerCategoryList) //{ // if (mainCateoryItem.Key.Equals(singleBestSellerCatItem.Key, StringComparison.CurrentCultureIgnoreCase)) // { // bestSellerCateoryItem = singleBestSellerCatItem; // foundMatchedBestSeller = true; // break; // } //} ////not found key match //if (!foundMatchedBestSeller) //{ // //check some specials // //(1) // //Main Category : Best Seller // //mobile-apps : mobile // //arts-crafts : arts // //baby-products : baby // //stripbooks : books // 
//mobile : wireless // //... //} return foundMatchedBestSeller; } private void searchSingleCategory(crifanLibAmazon.categoryItem singleCateoryItem) { //instant-video string curSearchCategoryKey = singleCateoryItem.Key; //1. general category url //instant-video //http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dinstant-video string generalCategoryUrl = amazonLib.generateMainCategoryUrlFromCategoryKey(curSearchCategoryKey); processEachSearchCategory(singleCateoryItem.Url); //2. Best Sellers crifanLibAmazon.categoryItem bestSellerCategoryItem; if (findMatchedBestSellerCategoryItem(singleCateoryItem, out bestSellerCategoryItem)) { gLogger.Info("Found corrsponding best seller item category url=" + bestSellerCategoryItem.Url); processEachSearchCategory(bestSellerCategoryItem.Url); } else { gLogger.Info("NOT found corrsponding best seller item category url, for: " + singleCateoryItem.Url); } //3. Movers & Shakers //string moversShakersCategoryUrl = ""; //if(curSearchCategoryKey in moversShakersCategoryList) //processEachSearchCategory(moversShakersCategoryUrl); //4. Top Rated //string topRatedCategoryUrl = ""; //if(curSearchCategoryKey in topRatedCategoryList) //processEachSearchCategory(topRatedCategoryUrl); } private void btnSearch_Click(object sender, EventArgs e) { /* * http://www.amazon.com/Best-Sellers/zgbs/ref=zg_bs_tab * http://www.amazon.com/gp/movers-and-shakers/ref=zg_bs_tab * http://www.amazon.com/gp/top-rated/ref=zg_bs_tab * * */ crifanLibAmazon.categoryItem curSelectedCategory = new crifanLibAmazon.categoryItem(); if (cmbSearchCategory.SelectedIndex >= 0) { //has selected some sub category curSelectedCategory = (crifanLibAmazon.categoryItem)cmbSearchCategory.SelectedItem; searchSingleCategory(curSelectedCategory); } } } }
【总结】
转载请注明:在路上 » 【代码分享】C#代码:ScrapeAmazonProduct – 抓取Amazon产品数据(完全从网页中抓取)