【背景】
之前已经有:
【代码分享】C#代码:ScrapeAmazonProduct – 抓取Amazon产品数据(完全从网页中抓取)
这个是其升级版:
主要改为从AWS的API中抓取数据,其次再从网页中抓取。
【ScrapeAmazonProduct代码分享】
1.截图:
2.项目代码下载:
ScrapeAmazonProduct_2013-09-10_scrapeFromAwsApi.7z
3.代码分享:
(1)frmScrapeAmazonProduct.cs
/* * [File] * frmScrapeAmazonProduct.cs * * [Function] * Scrape products data from Amazon, mainly from AWS API, partially from html * * [Author] * Crifan Li * * [Date] * 2013-09-10 * * [Contact] * https://www.crifan.org/contact_me/ */ using System; using System.Collections.Generic; using System.ComponentModel; using System.Data; using System.Drawing; using System.Text; using System.Windows.Forms; using System.Web; using System.Net; using System.Xml; using System.IO; using HtmlAgilityPack; using System.Text.RegularExpressions; using Excel = Microsoft.Office.Interop.Excel; using Microsoft.Office.Interop.Excel; using NLog; using NLog.Targets; using NLog.Config; namespace ScrapeAmazonProduct { public partial class frmScrapeAmazonProduct : Form { struct AmazonProductInfo { public string url; //record who it is public string title; public string description; //5 bullet public string[] bulletArr; // total 5 (or more, but only record 5) //download 5 pics public string[] imgUrlArr; // total 5 (or more, but only record 5) //product keyword fileds, up to 3 public string[] keywordFieldArr; //each field, less than 50 chars, seperated by ',' //cheapest price of total (up to 8) sellers public float cheapestPrice; public bool isOneSellerIsAmazon; public int reviewNumber; public bool isBestSeller; }; //for debug //private int lineNumber = 1; string defaultOutputFolderName = "output"; string defaultOutputImageFolderName = "images"; string gLogFilename; public static string constAmazonDomainUrl = "http://www.amazon.com"; public static int rule_minimalBuyerNumber; public static int rule_totalUnitNumber; public static int rule_maxLenEachBullet; public static int rule_maxDescriptionLen; public static float rule_dimensionMaxLengthCm; public static float rule_dimensionMaxWidthCm; public static float rule_dimensionMaxHeightCm; public static int rule_maxSingleKeywordFieldLen; public static float rule_maxWeightPounds; Dictionary<string, string> gMainCatMappingBestSellerCatDict; public 
crifanLib crl; public crifanLibAmazon amazonLib; //List<crifanLibAmazon.categoryItem> mainCategoryList; List<crifanLibAmazon.categoryItem> bestSellerCategoryList; //for log public Logger gLogger = null; //need continue search or not bool needContinueSearch = true; List<TreeNode> curSelTreeNodeList; enum search_status { SEARCH_STATUS_STOPPED, SEARCH_STATUS_SEARCHING, //SEARCH_STATUS_PAUSED }; search_status curSearchStatus = search_status.SEARCH_STATUS_STOPPED; //AWS API public crifanLibAws aws; List<crifanLibAws.awsBrowseNode> gMainBrowserNodeList; List<string> gProcessedAsinList; //makesure all ASIN is Upper Case public int gCurItemNum; public frmScrapeAmazonProduct() { //!!! for load embedded dll: (1) register resovle handler AppDomain.CurrentDomain.AssemblyResolve += new ResolveEventHandler(CurrentDomain_AssemblyResolve); crl = new crifanLib(); amazonLib = new crifanLibAmazon(); aws = new crifanLibAws(); //gMainCatMappingBestSellerCatDict = null; //init AWS API string awsAccessKeyId = "your aws access key id"; string awsSecretKey = "your aws secret key"; string awsAssociateTag = "your aws associate tag"; crifanLibAws.awsEndpoint usEndpoint = crifanLibAws.awsEndpoint.US; //note, here evenif you pass into 2011-08-02, but response xmlns still is: //http://webservices.amazon.com/AWSECommerceService/2011-08-01 //so here only use 2011-08-01 string awsApiVersion = "2011-08-01"; aws.initAws(awsAccessKeyId, awsSecretKey, awsAssociateTag, usEndpoint, awsApiVersion); gProcessedAsinList = new List<string>(); gCurItemNum = 1; curSelTreeNodeList = new List<TreeNode>(); InitializeComponent(); } //!!! for load embedded dll: (2) implement this handler System.Reflection.Assembly CurrentDomain_AssemblyResolve(object sender, ResolveEventArgs args) { string dllName = args.Name.Contains(",") ? 
args.Name.Substring(0, args.Name.IndexOf(',')) : args.Name.Replace(".dll", ""); dllName = dllName.Replace(".", "_"); if (dllName.EndsWith("_resources")) return null; System.Resources.ResourceManager rm = new System.Resources.ResourceManager(GetType().Namespace + ".Properties.Resources", System.Reflection.Assembly.GetExecutingAssembly()); byte[] bytes = (byte[])rm.GetObject(dllName); return System.Reflection.Assembly.Load(bytes); } //update UI according current status private void updateUI() { if (curSearchStatus == search_status.SEARCH_STATUS_STOPPED) { btnSearch.Enabled = true; btnSearch.Text = "Search"; btnStop.Enabled = false; //cmbSearchCategory.Enabled = true; //grbSelectCategory.Enabled = true; grbSettings.Enabled = true; } else if (curSearchStatus == search_status.SEARCH_STATUS_SEARCHING) { btnSearch.Enabled = false; btnSearch.Text = "Searching"; btnStop.Enabled = true; //cmbSearchCategory.Enabled = false; //grbSelectCategory.Enabled = false; grbSettings.Enabled = false; } } private void initLoggerFilename() { string searchCategoryName = ""; crifanLibAws.awsBrowseNode curSelectedBrowserNode = getCurSelBrowserNode(); searchCategoryName = curSelectedBrowserNode.Name; //string curDatetimeStr = DateTime.Now.ToString(); DateTime curDateTime = DateTime.Now; string curDatetimeStr = String.Format("{0:yyyy-MM-dd_HHmmss}", curDateTime); //"2013-06-11_142102" if (!string.IsNullOrEmpty(searchCategoryName)) { gLogFilename = curDatetimeStr + "_log_" + searchCategoryName + ".txt"; //"2013-06-11_153647_log_.txt" } else { gLogFilename = curDatetimeStr + "_log.txt"; //"2013-06-11_153647_log.txt" } gLogFilename = Path.Combine(txbOutputFolder.Text, gLogFilename); //{'D:\tmp\tmp_dev_root\freelance\elance\40939187_scrape_amazon\40939187_scrape_amazon\ScrapeAmazonProduct\ScrapeAmazonProduct\bin\Debug\output\2013-06-11_153647_log.txt'} } private void initLogger() { //logger = LogManager.GetCurrentClassLogger(); // Step 1. 
Create configuration object LoggingConfiguration logConfig = new LoggingConfiguration(); // Step 2. Create targets and add them to the configuration RichTextBoxTarget rtbTarget = new RichTextBoxTarget(); logConfig.AddTarget("richTextBox", rtbTarget); rtbTarget.FormName = "frmScrapeAmazonProduct"; // your winform class name rtbTarget.ControlName = "rtbLog"; // your RichTextBox control/variable name FileTarget fileTarget = new FileTarget(); logConfig.AddTarget("logFile", fileTarget); // Step 3. Set target properties //string commonLayout = "${date:format=yyyy-MM-dd HH\\:mm\\:ss} ${logger} ${message}"; //https://github.com/nlog/nlog/wiki/Layout-renderers //https://github.com/nlog/nlog/wiki/Level-Layout-Renderer //string commonLayout = "[${date:format=yyyy-MM-dd HH\\:mm\\:ss}][${level}] ${message}"; string commonLayout = "[${date:format=yyyy-MM-dd HH\\:mm\\:ss}][${pad:padding=5:inner=${level:uppercase=true}}] ${message}"; rtbTarget.Layout = commonLayout; //fileTarget.FileName = "${basedir}/output/log.txt"; //{'${basedir}/output/log.txt'} fileTarget.FileName = gLogFilename; //{'D:\tmp\tmp_dev_root\freelance\elance\40939187_scrape_amazon\40939187_scrape_amazon\ScrapeAmazonProduct\ScrapeAmazonProduct\bin\Debug\output\2013-06-11_153647_log.txt'} fileTarget.Layout = commonLayout; // Step 4. Define rules LoggingRule ruleRichTextBox = new LoggingRule("*", LogLevel.Info, rtbTarget); logConfig.LoggingRules.Add(ruleRichTextBox); LoggingRule ruleFile = new LoggingRule("*", LogLevel.Trace, fileTarget); logConfig.LoggingRules.Add(ruleFile); // Step 5. 
Activate the configuration LogManager.Configuration = logConfig; // Example usage //Logger logger = LogManager.GetLogger("Amazon"); //Logger logger = LogManager.GetLogger(""); gLogger = LogManager.GetLogger(""); //gLogger.Trace("trace log message"); //gLogger.Debug("debug log message"); //gLogger.Info("info log message"); //gLogger.Warn("warn log message"); //gLogger.Error("error log message"); //gLogger.Fatal("fatal log message"); } public void initRules() { rule_minimalBuyerNumber = Int32.Parse(txbMinBuyerNum.Text); rule_totalUnitNumber = Int32.Parse(txbTotalUnitNum.Text); rule_maxLenEachBullet = Int32.Parse(txbEachBulletMaxLen.Text); rule_maxDescriptionLen = Int32.Parse(txbMaxDescriptionLen.Text); rule_dimensionMaxLengthCm = float.Parse(txbDimensionHeight.Text); rule_dimensionMaxWidthCm = float.Parse(txbDimensionWidth.Text); rule_dimensionMaxHeightCm = float.Parse(txbDimensionHeight.Text); rule_maxSingleKeywordFieldLen = Int32.Parse(txbSingleKeywordFieldMaxLen.Text); rule_maxWeightPounds = float.Parse(txbMaxWeightPounds.Text); } public void initOutputRootFolder() { string currentFolder = Environment.CurrentDirectory; string defaultAbsOutputFolder = Path.Combine(currentFolder, defaultOutputFolderName); if (!Directory.Exists(defaultAbsOutputFolder)) { Directory.CreateDirectory(defaultAbsOutputFolder); } txbOutputFolder.Text = defaultAbsOutputFolder; } private string getCurrentOutputFullFilename() { return Path.Combine(txbOutputFolder.Text, txbExcelFilename.Text); } //init log filename //init logger //create ouput image foler private void afterChangeOutputFolder() { //3. init log filename initLoggerFilename(); //4. init logger initLogger(); //5. init output image foler string strOutputImageFolder = Path.Combine(txbOutputFolder.Text, defaultOutputImageFolderName); if (!Directory.Exists(strOutputImageFolder)) { Directory.CreateDirectory(strOutputImageFolder); } } private void frmScrapeAmazonProduct_Load(object sender, EventArgs e) { //1. init rules initRules(); //2. 
init output initOutputRootFolder(); ////5. init main category list to best seller mapping //initMainCategoryToBestSellerCategoryMapping(); //include init logger afterChangeOutputFolder(); //6. init main category list //initSearchCategory(); //!!! must init logger first initAwsCategory(); //7.update UI updateUI(); ////debug //string testAsin = "B0007S5N8O"; ////crifanLibAws.awsEditorialReview editorialReview = aws.awsGetEditorialReview(testAsin); //crifanLibAws.awsImages imagesInfo = aws.awsGetImages(testAsin); //debug //createOutputFile("D:\\download\\AmazonProductInfo.xls"); ////debug //string itemAsin = "B008D5UG6M"; //crifanLibAws.awsItemAttributes itemAttributes = aws.awsGetItemAttributes(itemAsin); ////debug //string itemAsin = "B004FGMDOQ"; //processAmazonItem(itemAsin); ////debug //string itemAsin = "B0005YWH7A"; //itemAsin = "B001FA1L9I"; //itemAsin = "B003YBJ9KY"; //itemAsin = "B000G1EO6O"; //itemAsin = "B000JSOBSA"; //itemAsin = "B000LKYUSM"; //itemAsin = "B002B8GH74"; //itemAsin = "B000EZYFRA"; //itemAsin = "B000BTAREY"; //itemAsin = "B000JSOBSU"; //itemAsin = "B0029JRTVI"; //itemAsin = "B0005ZW4QI"; //itemAsin = "B0005ZWJ0O"; //itemAsin = "B0095XRL1Y"; //itemAsin = "B001SB1BA8"; //string offerListingUrl = amazonLib.generateOfferListingUrl(itemAsin); //List<crifanLibAmazon.productSellerInfo> allSellerInfoList = new List<crifanLibAmazon.productSellerInfo>(); //amazonLib.extractAllSellerInfo(offerListingUrl, out allSellerInfoList); ////debug //string itemAsin = "B0029A71C4"; //processAmazonItem(itemAsin); } private bool checkBuyerNumber(string productHtml, out string invalidReason, out string usedAndNewUrl) { bool isBuyerNumberValid = false; invalidReason = "Unknow error for checkBuyerNumber"; usedAndNewUrl = ""; int buyerNumber = 0; if (amazonLib.extractProductBuyerNumberAndNewUrl(productHtml, out buyerNumber, out usedAndNewUrl)) { if (buyerNumber > rule_minimalBuyerNumber) { isBuyerNumberValid = true; invalidReason = ""; } else { isBuyerNumberValid = 
false; invalidReason = String.Format("Buyer Number is {0}, less than {1}", buyerNumber, rule_minimalBuyerNumber); } } else { isBuyerNumberValid = false; invalidReason = "Not found buyer number string and used and new url"; } return isBuyerNumberValid; } //http://www.amazon.com/gp/offer-listing/B0083PWAPW/ref=dp_olp_all_mbc?ie=UTF8&condition=all //"http://www.amazon.com/Frigidaire-FRA052XT7-000-BTU-Window-Conditioner/dp/B003F4TH6G/ref=lp_3737671_1_1?ie=UTF8&qid=1371183851&sr=1-1" //http://www.amazon.com/gp/product/B0009IQXFO/ref=olp_product_details?ie=UTF8 //"http://www.amazon.com/gp/product/B00A49TQPC" private bool checkTotalUnitNumber(string productUrl, out string invalidReason) { //debug //productUrl = "http://www.amazon.com/gp/offer-listing/B0083PWAPW/ref=dp_olp_all_mbc?ie=UTF8&condition=all"; //productUrl = "http://www.amazon.com/gp/offer-listing/B007HUUU6A/ref=dp_olp_new_mbc?ie=UTF8&condition=new"; string strNoError = "No Error"; bool bTotalUnitNumValid = false; //int totalNumber = 0; //invalidReason = "Unknow error for checkTotalUnitNumber"; invalidReason = strNoError; HtmlAgilityPack.HtmlDocument htmlDoc = null; //string respHtml = crl.getUrlRespHtml(productUrl); string respHtml = crl.getUrlRespHtml_multiTry(productUrl); //Method 2: just check the availGreen node //something wrong, so re-check //http://www.amazon.com/Battery-Tender-081-0069-6-Terminal-Disconnect/dp/B004JV6OMO/ref=zg_bs_15719731_80 //Only 2 left in stock. //<div class="buying" style="padding-bottom: 0.75em;"> // <span class="availGreen">Only 2 left in stock.</span> //http://www.amazon.com/Battery-Tender-021-0123-Junior-Charger/dp/B000CITK8S/ref=zg_bs_automotive_3 //In Stock. 
//<div class="buying" style="padding-bottom: 0.75em;"> // <span class="availGreen">In Stock.</span> htmlDoc = crl.htmlToHtmlDoc(respHtml); HtmlNode availGreenNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@class='buying']/span[@class='availGreen']"); if (availGreenNode == null) { //http://www.amazon.com/gp/product/B005SSWKMK //<div id="availability"> // <div class="a-color-available a-size-medium"> // In Stock. // </div> //</div> HtmlNode availabilityDivNode = htmlDoc.DocumentNode.SelectSingleNode("//div[@id='availability']/div"); availGreenNode = availabilityDivNode; // for latter to check } if (availGreenNode != null) { string strAvailGreen = availGreenNode.InnerText; //" \t\t\t\t\t\tIn Stock.\t\t " strAvailGreen = strAvailGreen.Trim(); //"In Stock." //http://www.amazon.com/gp/product/B0009IQXFO/ref=olp_product_details?ie=UTF8 //"In stock but may require an extra 1-2 days to process." if (strAvailGreen.StartsWith("in stock", StringComparison.CurrentCultureIgnoreCase)) { bTotalUnitNumValid = true; //consider "In Stock." is valid } else { //consider "Only N left in stock." 
as invalid bTotalUnitNumValid = false; invalidReason = strAvailGreen; gLogger.Debug("availGreen is " + strAvailGreen + " for " + productUrl); } } else { invalidReason = "Can not find 'In Stock.'"; gLogger.Debug(invalidReason + " for " + productUrl); } return bTotalUnitNumValid; } private bool checkWeight(string productUrl, string productHtml, out string invalidReason) { bool bNotExceedWeight = false; invalidReason = "Unknow error for checkWeight"; float maxKiloGram = crl.poundToKiloGram(rule_maxWeightPounds); float kiloGram = amazonLib.extractProductWeight(productHtml); //check valid or not if (kiloGram > 0.0F) { if (kiloGram <= maxKiloGram) { bNotExceedWeight = true; } else { bNotExceedWeight = false; invalidReason = String.Format("Weight is {0} kilogram, more than {1} pounds({2} kilograms)", kiloGram, rule_maxWeightPounds, maxKiloGram); } } else { bNotExceedWeight = false; invalidReason = "Not found weight string or unrecognized weight number"; } return bNotExceedWeight; } private bool checkDimension(string productUrl, string productHtml, out string invalidReason) { bool isValidDimension = false; invalidReason = "Unknow error for checkDimension"; crifanLibAmazon.productDimension dimensionCm = amazonLib.extractProductDimension(productHtml); if (dimensionCm.length > 0.0F) { crifanLibAmazon.productDimension dimensionMaxCm = new crifanLibAmazon.productDimension(); dimensionMaxCm.length = rule_dimensionMaxLengthCm; dimensionMaxCm.width = rule_dimensionMaxWidthCm; dimensionMaxCm.height = rule_dimensionMaxHeightCm; //check valid or not if ( (dimensionCm.length <= dimensionMaxCm.length) && (dimensionCm.width <= dimensionMaxCm.width) && (dimensionCm.height <= dimensionMaxCm.height) ) { isValidDimension = true; } else { isValidDimension = false; invalidReason = String.Format("Dimension: {0}cm x {1}cm x {2}cm invalid for exceed max: {3}cm x {4}cm x {5}cm", dimensionCm.length, dimensionCm.width, dimensionCm.height, dimensionMaxCm.length, dimensionMaxCm.width, 
dimensionMaxCm.height); } } else { //isValidDimension = false; //invalidReason = "Not found dimension string"; isValidDimension = true; // even if no dimension, also consider it as valid one if the weight is valid } return isValidDimension; } private bool checkProductValid(string productUrl, string productHtml, out string invalidReason, out string usedAndNewUrl) { bool isProductValid = true; invalidReason = ""; usedAndNewUrl = ""; //1. check buyer number > 8 if (isProductValid) { //debug isProductValid = checkBuyerNumber(productHtml, out invalidReason, out usedAndNewUrl); } //2. check total unit number > 50 if (isProductValid) { //debug //isProductValid = checkTotalUnitNumber(usedAndNewUrl, out invalidReason); isProductValid = checkTotalUnitNumber(productUrl, out invalidReason); } //3. check no more than 5 pounds (2.5 kg) if (isProductValid) { //debug isProductValid = checkWeight(productUrl, productHtml, out invalidReason); } //4. check dimension less than 80cmX80cmX80cm if (isProductValid) { //debug isProductValid = checkDimension(productUrl, productHtml, out invalidReason); } return isProductValid; } public void updateProgress(int percentage) { //pgbDownload.Value = percentage; } public void downloadPictures(string productUrl, string respHtml, out string[] picFullnameList) { picFullnameList = null; //init string productAsin = ""; if (amazonLib.extractAsinFromProductUrl(productUrl, out productAsin)) { } else { //something wrong } //creat folder string picFolderFullPath = Path.Combine(txbOutputFolder.Text, productAsin); if (!Directory.Exists(picFolderFullPath)) { Directory.CreateDirectory(picFolderFullPath); } string[] imageUrlList = amazonLib.extractProductImageList(respHtml); gLogger.Info("Extracted image url list:"); if (imageUrlList != null) { picFullnameList = new string[imageUrlList.Length]; for (int idx = 0; idx < imageUrlList.Length; idx++) { string imageUrl = imageUrlList[idx]; gLogger.Info(String.Format("[{0}]={1}", idx, imageUrl)); string picFilename = 
crl.extractFilenameFromUrl(imageUrl); string picFullFilename = Path.Combine(picFolderFullPath, picFilename); string errorStr = ""; gLogger.Info(String.Format("Downloading {0} to {1}", imageUrl, picFullFilename)); crl.downloadFile(imageUrl, picFullFilename, out errorStr, updateProgress); //update picFullnameList[idx] = picFullFilename; } } else { gLogger.Error("No image url for " + productUrl); } } private void createOutputFile(string excelFullFilename) { gLogger.Info("Creating ouput file " + excelFullFilename); //bool isAutoFit = true; bool isHeaderBold = true; //init //if exist remove it if (File.Exists(excelFullFilename)) { File.Delete(excelFullFilename); } Excel.Application xlApp = new Excel.Application(); Excel.Workbook xlWorkBook; Excel.Worksheet xlWorkSheet; object misValue = System.Reflection.Missing.Value; xlApp = new Excel.ApplicationClass(); xlWorkBook = xlApp.Workbooks.Add(misValue); xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1); const int excelRowHeader = 1; const int excelColumnHeader = 1; //save header int curColumnIdx = 0 + excelColumnHeader; int rowIdx = 0 + excelRowHeader; xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "Title"; xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "Description"; const int constBullerLen = 5; for (int bulletIdx = 0; bulletIdx < constBullerLen; bulletIdx++) { int bulletNum = bulletIdx + 1; xlWorkSheet.Cells[rowIdx, curColumnIdx + bulletIdx] = "Bullet" + bulletNum.ToString(); } curColumnIdx = curColumnIdx + constBullerLen; const int constImgNameListLen = 5; for (int imgIdx = 0; imgIdx < constImgNameListLen; imgIdx++) { int imgNum = imgIdx + 1; xlWorkSheet.Cells[rowIdx, curColumnIdx + imgIdx] = "ImageFilename" + imgNum.ToString(); } curColumnIdx = curColumnIdx + constImgNameListLen; xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "CheapestPrice"; xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "OneSellerIsAmazon"; xlWorkSheet.Cells[rowIdx, curColumnIdx++] = "ReviewNumber"; xlWorkSheet.Cells[rowIdx, curColumnIdx++] = 
"IsBestSeller"; //formatting //(1) header to bold if (isHeaderBold) { Range headerRow = xlWorkSheet.get_Range("1:1", System.Type.Missing); headerRow.Font.Bold = true; } //here not autoFit for latter when save into will autofit ////(2) auto adjust column width (according to content) //if (isAutoFit) //{ // Range allColumn = xlWorkSheet.Columns; // allColumn.AutoFit(); //} //output xlWorkBook.SaveAs(excelFullFilename, XlFileFormat.xlWorkbookNormal, misValue, misValue, misValue, misValue, XlSaveAsAccessMode.xlExclusive, XlSaveConflictResolution.xlLocalSessionChanges, misValue, misValue, misValue, misValue); xlWorkBook.Close(true, misValue, misValue); xlApp.Quit(); crl.releaseObject(xlWorkSheet); crl.releaseObject(xlWorkBook); crl.releaseObject(xlApp); } private void appendInfoToFile(string fullFilename, AmazonProductInfo productInfo) { gLogger.Info("Saving product info for " + productInfo.url); bool isAutoFitForFistColumn = true; Excel.Application xlApp; Excel.Workbook xlWorkBook; Excel.Worksheet xlWorkSheet; object missingVal = System.Reflection.Missing.Value; xlApp = new Microsoft.Office.Interop.Excel.Application(); //xlApp.Visible = true; //xlApp.DisplayAlerts = false; //http://msdn.microsoft.com/zh-cn/library/microsoft.office.interop.excel.workbooks.open%28v=office.11%29.aspx xlWorkBook = xlApp.Workbooks.Open( Filename : fullFilename, //UpdateLinks:3, ReadOnly : false, //Format : 2, //use Commas as delimiter when open text file //Password : missingVal, //WriteResPassword : missingVal, //IgnoreReadOnlyRecommended: false, //when save to readonly, will notice you Origin: Excel.XlPlatform.xlWindows, //xlMacintosh/xlWindows/xlMSDOS //Delimiter: ",", // usefule when is text file Editable : true, Notify : false, //Converter: missingVal, AddToMru: true, //True to add this workbook to the list of recently used files Local: true, CorruptLoad: missingVal //xlNormalLoad/xlRepairFile/xlExtractData ); //Get the first sheet xlWorkSheet = 
(Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1); //also can get by sheet name Excel.Range range = xlWorkSheet.UsedRange; //int usedColCount = range.Columns.Count; int usedRowCount = range.Rows.Count; const int excelRowHeader = 1; const int excelColumnHeader = 1; //int curColumnIdx = usedColCount + excelColumnHeader; int curColumnIdx = 0 + excelColumnHeader; //start from column begin int curRrowIdx = usedRowCount + excelRowHeader; // !!! here must added buildin excelRowHeader=1, otherwise will overwrite previous (added title or whole row value) xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.title; xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.description; const int constBullerLen = 5; int bulletListLen = 0; if (productInfo.bulletArr.Length > constBullerLen) { bulletListLen = constBullerLen; } else { bulletListLen = productInfo.bulletArr.Length; } for (int bulletIdx = 0; bulletIdx < bulletListLen; bulletIdx++) { xlWorkSheet.Cells[curRrowIdx, curColumnIdx + bulletIdx] = productInfo.bulletArr[bulletIdx]; } curColumnIdx = curColumnIdx + bulletListLen; const int constImgNameListLen = 5; int imgNameListLen = 0; if (productInfo.imgUrlArr.Length > constImgNameListLen) { imgNameListLen = constImgNameListLen; } else { imgNameListLen = productInfo.imgUrlArr.Length; } for (int imgIdx = 0; imgIdx < imgNameListLen; imgIdx++) { xlWorkSheet.Cells[curRrowIdx, curColumnIdx + imgIdx] = productInfo.imgUrlArr[imgIdx]; } curColumnIdx = curColumnIdx + imgNameListLen; xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.cheapestPrice; xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.isOneSellerIsAmazon; xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.reviewNumber; xlWorkSheet.Cells[curRrowIdx, curColumnIdx++] = productInfo.isBestSeller; //(2) auto adjust first column width (according to content) if (isAutoFitForFistColumn) { //Range firstColumn = (Range)xlWorkSheet.Columns[0]; Range firstColumn = xlWorkSheet.get_Range("A1"); 
//firstColumn.AutoFit(); firstColumn.EntireColumn.AutoFit(); } ////http://msdn.microsoft.com/query/dev10.query?appId=Dev10IDEF1&l=ZH-CN&k=k%28MICROSOFT.OFFICE.INTEROP.EXCEL._WORKBOOK.SAVEAS%29;k%28SAVEAS%29;k%28TargetFrameworkMoniker-%22.NETFRAMEWORK%2cVERSION%3dV3.5%22%29;k%28DevLang-CSHARP%29&rd=true //xlWorkBook.SaveAs( // Filename: fullFilename, // ConflictResolution: XlSaveConflictResolution.xlLocalSessionChanges //The local user's changes are always accepted. // //FileFormat : Excel.XlFileFormat.xlWorkbookNormal //); //if use above SaveAs -> will popup a window ask you overwrite it or not, even if you have set the ConflictResolution to xlLocalSessionChanges, which should not ask, should directly save xlWorkBook.Save(); //http://msdn.microsoft.com/query/dev10.query?appId=Dev10IDEF1&l=ZH-CN&k=k%28MICROSOFT.OFFICE.INTEROP.EXCEL._WORKBOOK.CLOSE%29;k%28CLOSE%29;k%28TargetFrameworkMoniker-%22.NETFRAMEWORK%2cVERSION%3dV3.5%22%29;k%28DevLang-CSHARP%29&rd=true xlWorkBook.Close(SaveChanges : true); crl.releaseObject(xlWorkSheet); crl.releaseObject(xlWorkBook); crl.releaseObject(xlApp); } //save product info private void saveProductInfo(AmazonProductInfo productInfo) { string outputExcelFullFilename = Path.Combine(txbOutputFolder.Text, txbExcelFilename.Text); //check if output excel file already exist if (!File.Exists(outputExcelFullFilename)) { //if no, create it, add header createOutputFile(outputExcelFullFilename); } //then append info to it appendInfoToFile(outputExcelFullFilename, productInfo); return; } /* * productUrl=http://www.amazon.com/Kindle-Paperwhite-Touch-light/dp/B007OZNZG0/ref=lp_1055398_1_1?ie=UTF8&qid=1370510177&sr=1-1 * usedAndNewUrl=http://www.amazon.com/gp/offer-listing/B007OZNZG0/ref=dp_olp_all_mbc?ie=UTF8&condition=all */ private bool extractProductInfo(string productUrl, string productHtml, string usedAndNewUrl, out AmazonProductInfo productInfo) { gLogger.Info("Extracting info for " + productUrl); //init bool extractProductInfoOk = true; 
productInfo = new AmazonProductInfo(); productInfo.url = productUrl; productInfo.cheapestPrice = float.MaxValue; productInfo.isOneSellerIsAmazon = false; //must init, otherwise, when only got 4 bullet, here total 5 -> last is null -> assign later will exception productInfo.bulletArr = new string[5]; crl.emptyStringArray(productInfo.bulletArr); productInfo.imgUrlArr = new string[5]; crl.emptyStringArray(productInfo.imgUrlArr); productInfo.keywordFieldArr = new string[3]; crl.emptyStringArray(productInfo.keywordFieldArr); //1. title productInfo.title = amazonLib.extractProductTitle(productHtml); gLogger.Info("Title=" + productInfo.title); //2. description and 5 bullet List<string> bulletList = new List<string>(); bool gotBullets = amazonLib.extractProductBulletList(productHtml, out bulletList); gLogger.Info("Extracted Bullets=" + gotBullets); string description = ""; bool gotDescription = amazonLib.extractProductDescription(productHtml, out description); gLogger.Info("Got Description=" + gotDescription); /* * 1. if no description, use bullet * 2. if more than normal 5 bullets, get all bullets, just use first 5 bullets to description * 3. 
if no bullet, use description to split to 5 bullets */ //type1: has description, has bullet if ((description != "") && (bulletList.Count > 0)) { productInfo.description = description; //bullets //maybe has more than 5 bullets //maybe less than 5 bullets //http://www.amazon.com/AmazonBasics-Lightning-Compatible-Cable-inch/dp/B00B5RGAWY/ref=sr_1_3?s=wireless&ie=UTF8&qid=1369753764&sr=1-3 //has feature-bullets_feature_div, but no content -> bulletsNodeList is null for (int idx = 0; idx < bulletList.Count; idx++) { string bulletStr = bulletList[idx]; //get first 5 -> to bullet if (idx < 5) { productInfo.bulletArr[idx] = bulletStr; } } } //type2: no description, has bullet else if ((description == "") && (bulletList.Count > 0)) { //bullets //maybe has more than 5 bullets //maybe less than 5 bullets for (int idx = 0; idx < bulletList.Count; idx++) { string bulletStr = bulletList[idx]; //get first 5 -> to bullet if (idx < 5) { productInfo.bulletArr[idx] = bulletStr; } //all bullet -> description description = description + bulletStr + Environment.NewLine; } productInfo.description = description; } //type3: has description, no bullet else if ((description != "") && (bulletList.Count == 0)) { productInfo.description = description; //seperate description to many lines string[] lines = description.Split('.'); //maybe less than 5, maybe greater than 5 for (int idx = 0; idx < lines.Length; idx++) { string curLine = lines[idx]; //get first 5 -> to bullet if (idx < 5) { productInfo.bulletArr[idx] = curLine; } } } //type4: no description, no bullet else if ((description == "") && (bulletList.Count == 0)) { //something wrong extractProductInfoOk = false; return extractProductInfoOk; } //check max length for each bullet for (int idx = 0; idx < productInfo.bulletArr.Length; idx++) { if (productInfo.bulletArr[idx].Length > rule_maxLenEachBullet) { productInfo.bulletArr[idx] = productInfo.bulletArr[idx].Substring(0, rule_maxLenEachBullet); } } //check max length for whole description ? 
//3. download 5(or 7) pics string[] picFullnameList = null; //debug downloadPictures(productUrl, productHtml, out picFullnameList); if ((picFullnameList != null) && (picFullnameList.Length > 0)) { int maxImageCount = 0; if (picFullnameList.Length > productInfo.imgUrlArr.Length) { maxImageCount = productInfo.imgUrlArr.Length; } else { maxImageCount = picFullnameList.Length; } for (int idx = 0; idx < maxImageCount; idx++) { productInfo.imgUrlArr[idx] = picFullnameList[idx]; } } //4.extract product seller info: price and name List<crifanLibAmazon.productSellerInfo> allSellerInfoList = new List<crifanLibAmazon.productSellerInfo>(); if (amazonLib.extractAllSellerInfo(usedAndNewUrl, out allSellerInfoList)) { if ((allSellerInfoList != null) && (allSellerInfoList.Count > 0)) { foreach (crifanLibAmazon.productSellerInfo eachSellerInfo in allSellerInfoList) { //(1) calc cheapest price if (eachSellerInfo.price < productInfo.cheapestPrice) { productInfo.cheapestPrice = eachSellerInfo.price; } //(2) find whether one of the sellers is Amazon //here means: one of the seller's name is: Amazon.com if (eachSellerInfo.name.Equals("Amazon.com", StringComparison.CurrentCultureIgnoreCase)) { productInfo.isOneSellerIsAmazon = true; } } if (productInfo.cheapestPrice.CompareTo(float.MaxValue) == 0) { gLogger.Info(String.Format("Omit this {0} for not find valid cheapest price for {1} ", productUrl, usedAndNewUrl)); extractProductInfoOk = false; return extractProductInfoOk; } else { gLogger.Info("Cheapest Price=" + productInfo.cheapestPrice); gLogger.Info("One of Seller is Amazon=" + productInfo.isOneSellerIsAmazon); } } else { gLogger.Info(String.Format("Omit this {0} for found seller info but is invalid for {1} ", productUrl, usedAndNewUrl)); extractProductInfoOk = false; return extractProductInfoOk; } } else { gLogger.Info(String.Format("Omit this {0} for not found seller info for {1} ", productUrl, usedAndNewUrl)); extractProductInfoOk = false; return extractProductInfoOk; } //5. 
3 keyword Field
            productInfo.keywordFieldArr = amazonLib.extractProductKeywordField(productInfo.title, productInfo.keywordFieldArr.Length, rule_maxSingleKeywordFieldLen);
            gLogger.Info("Keyword Field List:");
            if ((productInfo.keywordFieldArr != null) && (productInfo.keywordFieldArr.Length > 0))
            {
                for (int idx = 0; idx < productInfo.keywordFieldArr.Length; idx++)
                {
                    String keywordField = productInfo.keywordFieldArr[idx];
                    gLogger.Info(String.Format("[{0}]={1}", idx, keywordField));
                }
            }

            //6. product review
            productInfo.reviewNumber = amazonLib.extractProductReviewNumber(productHtml: productHtml);
            gLogger.Info("ReviewNumber=" + productInfo.reviewNumber);

            //7. product best seller rank number list
            List<crifanLibAmazon.productBestRank> bestSellerRankList = amazonLib.extractProductBestSellerRankList(productHtml);
            if ((bestSellerRankList != null) && (bestSellerRankList.Count > 0))
            {
                productInfo.isBestSeller = true;
                gLogger.Info("Is BestSeller=" + productInfo.isBestSeller);
            }
            else
            {
                //this branch is reached when the list is null as well as when it is empty,
                //so the list must not be dereferenced without a null check
                gLogger.Debug(" or count not > 0 : " + ((bestSellerRankList == null) ? "null" : bestSellerRankList.ToString()));
                gLogger.Info(String.Format("Omit this {0} for bestSellerRankList is empty", productUrl));

                extractProductInfoOk = false;
                return extractProductInfoOk;
            }

            return extractProductInfoOk;
        }

        //Check whether a single (variation-less) product is valid; if valid, extract and save its info.
        //  productUrl: url of the product page, for example
        //  http://www.amazon.com/Silver-Linings-Playbook/dp/B00CL68QVQ/ref=sr_1_2?s=instant-video&ie=UTF8&qid=1368688342&sr=1-2
        //The page html is fetched once and shared by the validity check and the extraction.
        private void checkAndExtractForSingleProduct(string productUrl)
        {
            string invalidReason = "";
            string usedAndNewUrl = "";
            string productHtml = crl.getUrlRespHtml_multiTry(productUrl);

            if (!checkProductValid(productUrl, productHtml, out invalidReason, out usedAndNewUrl))
            {
                gLogger.Info(String.Format("-INVALID- product={0}, reason={1}", productUrl, invalidReason));
                return;
            }

            gLogger.Info("+VALID+ Product=" + productUrl);

            AmazonProductInfo productInfo;
            if (extractProductInfo(productUrl, productHtml, usedAndNewUrl, out productInfo))
            {
                saveProductInfo(productInfo);
            }
        }

        //Check whether a single product variation is valid; if valid, extract and save its info.
        //  singleVariationItem: the variation to process; its url is fetched and its label
        //  is appended to the extracted title when the title does not already end with it.
        private void checkAndExtractForSingleVariation(crifanLibAmazon.variationItem singleVariationItem)
        {
            bool isProductValid = false;
            string invalidReason = "";
            string usedAndNewUrl = "";

            gLogger.Info("processing variation " + singleVariationItem.url);
            string productHtml = crl.getUrlRespHtml_multiTry(singleVariationItem.url);

            isProductValid = checkProductValid(singleVariationItem.url, productHtml, out invalidReason, out usedAndNewUrl);
            if (isProductValid)
            {
                gLogger.Info("Valid product=" + singleVariationItem.url);

                AmazonProductInfo productInfo;
                if (extractProductInfo(singleVariationItem.url, productHtml, usedAndNewUrl, out productInfo))
                {
                    //Some titles already carry the variation label, eg:
                    //http://www.amazon.com/GE-MWF-Refrigerator-Filter-1-Pack/dp/B000AST3AK/ref=lp_1055398_1_4?ie=UTF8&qid=1370574186&sr=1-4
                    //  GE MWF Refrigerator Water Filter, 1-Pack
                    //http://www.amazon.com/gp/product/B003BIG0DO/ref=twister_B000AST3AK?ie=UTF8&psc=1
                    //  GE SmartWater MWF Refrigerator Water Filter, 2-Pack
                    //Others share one title across all variations, eg:
                    //http://www.amazon.com/Thermos-Insulated-18-Ounce-Stainless-Steel-Hydration/dp/B000FJ9DOK/ref=lp_1055398_1_6?ie=UTF8&qid=1370574186&sr=1-6
                    //http://www.amazon.com/gp/product/B0057FQCNC/ref=twister_B000FJ9DOK?ie=UTF8&psc=1
                    //so the label is appended only when it is missing.
                    //A null/empty label is skipped: EndsWith(null) would throw and there is nothing to append.
                    string variationLabel = singleVariationItem.label;
                    if (!string.IsNullOrEmpty(variationLabel)
                        && !productInfo.title.EndsWith(variationLabel, StringComparison.Ordinal))
                    {
                        productInfo.title = productInfo.title + ", " + variationLabel;
                    }

                    saveProductInfo(productInfo);
                }
            }
            else
            {
                gLogger.Info(String.Format("Invalid 
product={0}, reason={1}",singleVariationItem.url, invalidReason));
            }
        }

        //Process every product found in one search result page.
        //  curPageSearchUrl: url the page html was fetched from
        //  singlePageHtml:   html of that page
        //A product with variations is processed once per variation; otherwise the product itself is processed.
        //Stops early as soon as needContinueSearch becomes false.
        private void processSinglePageHtml(string curPageSearchUrl, string singlePageHtml)
        {
            List<crifanLibAmazon.searchResultItem> searchedItemList;
            if (!amazonLib.extractSearchItemList(curPageSearchUrl, singlePageHtml, out searchedItemList)
                || (searchedItemList == null))
            {
                return;
            }

            foreach (crifanLibAmazon.searchResultItem eachSearchResultItem in searchedItemList)
            {
                if (!needContinueSearch)
                {
                    break;
                }

                gLogger.Info("processing single product url " + eachSearchResultItem.productUrl);

                crifanLibAmazon.productVariationInfo variationInfo;
                if (amazonLib.checkVariation(eachSearchResultItem.productUrl, out variationInfo))
                {
                    //has variations -> process each variation
                    List<crifanLibAmazon.variationItem> variationList = variationInfo.variationList;
                    gLogger.Info(String.Format("Total {0} variations for {1}", variationList.Count, eachSearchResultItem.productUrl));

                    foreach (crifanLibAmazon.variationItem eachVariationItem in variationList)
                    {
                        if (!needContinueSearch)
                        {
                            break;
                        }

                        checkAndExtractForSingleVariation(eachVariationItem);
                    }
                }
                else
                {
                    //no variation -> directly process this single product
                    gLogger.Info("no variation for " + eachSearchResultItem.productUrl);
                    checkAndExtractForSingleProduct(eachSearchResultItem.productUrl);
                }
            }
        }

        //Walk every level-1 sub category of a search category and process all of its result pages.
        //  curPageSearchUrl: category search url, eg
        //  "http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dappliances"
        //Stops early as soon as needContinueSearch becomes false.
        private void processEachSearchCategory(string curPageSearchUrl)
        {
            gLogger.Info("processing search category " + curPageSearchUrl);

            //find all level 1 child category url list
            List<crifanLibAmazon.categoryItem> subCategoryList = amazonLib.extractSubCategoryList(curPageSearchUrl);
            if (subCategoryList == null)
            {
                return;
            }

            foreach (crifanLibAmazon.categoryItem subCategory in subCategoryList)
            {
                if (!needContinueSearch)
                {
                    break;
                }

                //first page, eg:
                //http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dinstant-video
                //following page, eg:
                //http://www.amazon.com/s/ref=sr_pg_2?rh=n%3A2625373011%2Cn%3A%212644981011%2Cn%3A%212644982011%2Cn%3A2858778011&page=2&ie=UTF8&qid=1368697688
                curPageSearchUrl = subCategory.Url;

                while (needContinueSearch)
                {
                    string eachPageHtml = crl.getUrlRespHtml_multiTry(curPageSearchUrl);
                    processSinglePageHtml(curPageSearchUrl, eachPageHtml);

                    string nextPageUrl = "";
                    if (!amazonLib.extractNextPageUrl(curPageSearchUrl, eachPageHtml, out nextPageUrl))
                    {
                        //something wrong
                        break;
                    }

                    if (string.IsNullOrEmpty(nextPageUrl))
                    {
                        //no more page
                        break;
                    }

                    //Advance to the next page. Without this assignment the loop would fetch
                    //and process the same page again and again.
                    //NOTE(review): assumes extractNextPageUrl returns a directly fetchable url - confirm
                    curPageSearchUrl = nextPageUrl;
                }
            }
        }

        //Find the best seller category mapped to the given main category (static mapping only).
        //  mainCateoryItem:       main category whose Key is looked up in gMainCatMappingBestSellerCatDict
        //  bestSellerCateoryItem: the matched item from bestSellerCategoryList, or a new empty item when not found
        //Returns true when a match was found.
        public bool findMatchedBestSellerCategoryItem(crifanLibAmazon.categoryItem mainCateoryItem, out crifanLibAmazon.categoryItem bestSellerCateoryItem)
        {
            bestSellerCateoryItem = new crifanLibAmazon.categoryItem();

            //Method 1: static mapping
            if (gMainCatMappingBestSellerCatDict == null)
            {
                return false;
            }

            //single lookup instead of ContainsKey + indexer
            string bestSellerCategoryKey;
            if (!gMainCatMappingBestSellerCatDict.TryGetValue(mainCateoryItem.Key, out bestSellerCategoryKey))
            {
                return false;
            }

            foreach (crifanLibAmazon.categoryItem singleBestSellerCatItem in bestSellerCategoryList)
            {
                //static string.Equals tolerates a null mapped key
                if (string.Equals(bestSellerCategoryKey, singleBestSellerCatItem.Key, StringComparison.CurrentCultureIgnoreCase))
                {
                    bestSellerCateoryItem = singleBestSellerCatItem;
                    return true;
                }
            }

            return false;
        }

        private void searchSingleCategory(crifanLibAmazon.categoryItem singleCateoryItem)
        {
            //instant-video
            string curSearchCategoryKey = singleCateoryItem.Key;

            //1. 
general category url
            //instant-video
            //http://www.amazon.com/s/ref=nb_sb_noss?url=search-alias%3dinstant-video
            string generalCategoryUrl = amazonLib.generateMainCategoryUrlFromCategoryKey(curSearchCategoryKey);
            processEachSearchCategory(singleCateoryItem.Url);

            //2. Best Sellers
            crifanLibAmazon.categoryItem bestSellerCategoryItem;
            if (findMatchedBestSellerCategoryItem(singleCateoryItem, out bestSellerCategoryItem))
            {
                gLogger.Info("Found corrsponding best seller item category url=" + bestSellerCategoryItem.Url);
                processEachSearchCategory(bestSellerCategoryItem.Url);
            }
            else
            {
                gLogger.Info("NOT found corrsponding best seller item category url, for: " + singleCateoryItem.Url);
            }
        }

        //Start a search; does nothing unless the current status is STOPPED.
        //The status is always put back to STOPPED (and the UI refreshed) when the search ends,
        //even if the search throws, so a failed search can not leave the form stuck in "searching".
        private void btnSearch_Click(object sender, EventArgs e)
        {
            if (curSearchStatus != search_status.SEARCH_STATUS_STOPPED)
            {
                return;
            }

            needContinueSearch = true;

            //start search
            curSearchStatus = search_status.SEARCH_STATUS_SEARCHING;
            updateUI();

            try
            {
                cleanAndReinitBeforeSearch();
                awsCategorySearch();
            }
            finally
            {
                //end search
                curSearchStatus = search_status.SEARCH_STATUS_STOPPED;
                updateUI();
            }
        }

        //Clear the log box, restart the item counter and re-create the log file name and logger.
        private void cleanAndReinitBeforeSearch()
        {
            rtbLog.Text = "";
            gCurItemNum = 1;

            //re-init log filename
            initLoggerFilename();
            //re-init logger
            initLogger();
        }

        //Let the user pick a new output folder; only an OK result changes anything.
        private void btnChangeOutputFolder_Click(object sender, EventArgs e)
        {
            DialogResult outputFolderResult = fbdOutputFolder.ShowDialog();
            if (outputFolderResult != System.Windows.Forms.DialogResult.OK)
            {
                return;
            }

            txbOutputFolder.Text = fbdOutputFolder.SelectedPath;
            afterChangeOutputFolder();
        }

        //Open the output folder, if it exists.
        private void btnOpenOutputFolder_Click(object sender, EventArgs e)
        {
            if (Directory.Exists(txbOutputFolder.Text))
            {
                crl.openFileDirectly(txbOutputFolder.Text);
            }
        }

        //Show the output excel file selected in its folder; when the file does not exist
        //the output folder path itself is passed to openFolderAndSelectFile instead.
        private void btnBrowserOutputFile_Click(object sender, EventArgs e)
        {
            string currentOutputFullFilename = Path.Combine(txbOutputFolder.Text, txbExcelFilename.Text);

            gLogger.Debug("In btnBrowserOutputFile_Click:");
            gLogger.Debug("OutputFolder=" + txbOutputFolder.Text);
            gLogger.Debug("ExcelFilename=" + txbExcelFilename.Text);
            gLogger.Debug("currentOutputFullFilename=" + currentOutputFullFilename);

            string pathToSelect = File.Exists(currentOutputFullFilename)
                ? currentOutputFullFilename
                : txbOutputFolder.Text;
            crl.openFolderAndSelectFile(pathToSelect);
        }

        //Open the output excel file directly; when it does not exist but the output folder does,
        //the folder path is passed to openFolderAndSelectFile instead.
        private void btnOpenOutputFile_Click(object sender, EventArgs e)
        {
            string currentOutputFullFilename = Path.Combine(txbOutputFolder.Text, txbExcelFilename.Text);
            if (File.Exists(currentOutputFullFilename))
            {
                crl.openFileDirectly(currentOutputFullFilename);
            }
            else if (Directory.Exists(txbOutputFolder.Text))
            {
                crl.openFolderAndSelectFile(txbOutputFolder.Text);
            }
        }

        //Request the running search to stop; does nothing unless the current status is SEARCHING.
        //The search loops poll needContinueSearch and break out once it is false.
        private void btnStop_Click(object sender, EventArgs e)
        {
            if (curSearchStatus != search_status.SEARCH_STATUS_SEARCHING)
            {
                return;
            }

            curSearchStatus = search_status.SEARCH_STATUS_STOPPED;
            updateUI();

            //do stop things
            needContinueSearch = false;
        }

        /****************************** AWS ********************************/

        //Build the list of root category (SearchIndex) names.
        //initAwsCategory pairs every entry, by index, with the entry of buildMainBrowserNodeIdList,
        //so an entry must never be added/removed here without doing the same there.
        private List<string> buildMainBrowserNodeNameList()
        {
            List<string> mainBrowserNodeNameList = new List<string>();

            //http://docs.aws.amazon.com/AWSECommerceService/latest/DG/BrowseNodeIDs.html
            //mainBrowserNodeNameList.Add("");
            mainBrowserNodeNameList.Add("Apparel");
            mainBrowserNodeNameList.Add("Appliances");
            mainBrowserNodeNameList.Add("ArtsAndCrafts");
            mainBrowserNodeNameList.Add("Automotive");
            mainBrowserNodeNameList.Add("Baby");
            mainBrowserNodeNameList.Add("Beauty");
            mainBrowserNodeNameList.Add("Books");
            mainBrowserNodeNameList.Add("Classical");
            mainBrowserNodeNameList.Add("Collectibles");
            //Code=AWS.InvalidParameterValue, Message=195208011 is not a valid value for BrowseNodeId. Please change this value and retry your request.
//mainBrowserNodeNameList.Add("DigitalMusic");
            mainBrowserNodeNameList.Add("DVD");
            mainBrowserNodeNameList.Add("Electronics");
            mainBrowserNodeNameList.Add("ForeignBooks");
            mainBrowserNodeNameList.Add("Garden");
            //Request valid, but Error: Code=AWS.InvalidParameterValue, Message=3580501 is not a valid value for BrowseNodeId. Please change this value and retry your request.
            //mainBrowserNodeNameList.Add("GourmetFood");
            mainBrowserNodeNameList.Add("Grocery");
            mainBrowserNodeNameList.Add("HealthPersonalCare");
            mainBrowserNodeNameList.Add("Hobbies");
            mainBrowserNodeNameList.Add("Home");
            //Request valid, but Error: Code=AWS.InvalidParameterValue, Message=285080 is not a valid value for BrowseNodeId. Please change this value and retry your request.
            mainBrowserNodeNameList.Add("HomeGarden");
            ////https://www.crifan.org/amazon_asw_api_usage_notice/
            ////PetSupplies=1063498, is just sub category of Home & Kitchen=1055398
            ////mainBrowserNodeNameList.Add("PetSupplies");
            //mainBrowserNodeNameList.Add("Home & Kitchen");
            mainBrowserNodeNameList.Add("HomeImprovement");
            mainBrowserNodeNameList.Add("Industrial");
            mainBrowserNodeNameList.Add("Jewelry");
            mainBrowserNodeNameList.Add("KindleStore");
            mainBrowserNodeNameList.Add("Kitchen");
            //mainBrowserNodeNameList.Add("LawnGarden");
            //https://www.crifan.org/aws_searchindex_has_not_support_lawngarden_changed_to_lawnandgarden/
            mainBrowserNodeNameList.Add("LawnAndGarden");
            mainBrowserNodeNameList.Add("Lighting");
            mainBrowserNodeNameList.Add("Magazines");
            mainBrowserNodeNameList.Add("Miscellaneous");
            mainBrowserNodeNameList.Add("MobileApps");
            //Request valid, but Error: Code=AWS.InvalidParameterValue, Message=195211011 is not a valid value for BrowseNodeId. Please change this value and retry your request.
            //mainBrowserNodeNameList.Add("MP3Downloads");
            mainBrowserNodeNameList.Add("Music");
            mainBrowserNodeNameList.Add("MusicalInstruments");
            mainBrowserNodeNameList.Add("OfficeProducts");
            //mainBrowserNodeNameList.Add("OutdoorLiving");
            //mainBrowserNodeNameList.Add("PCHardware");
            //seem also miss this
            mainBrowserNodeNameList.Add("PetSupplies");
            //mainBrowserNodeNameList.Add("Photo");
            mainBrowserNodeNameList.Add("Shoes");
            mainBrowserNodeNameList.Add("Software");
            mainBrowserNodeNameList.Add("SoftwareVideoGames");
            mainBrowserNodeNameList.Add("SportingGoods");
            mainBrowserNodeNameList.Add("Tools");
            mainBrowserNodeNameList.Add("Toys");
            //Request valid, but Error: Code=AWS.InvalidParameterValue, Message=404272 is not a valid value for BrowseNodeId. Please change this value and retry your request.
            //mainBrowserNodeNameList.Add("VHS");
            mainBrowserNodeNameList.Add("Video");
            //mainBrowserNodeNameList.Add("VideoGames");
            mainBrowserNodeNameList.Add("Watches");
            //mainBrowserNodeNameList.Add("Wireless");
            //mainBrowserNodeNameList.Add("WirelessAccessories");

            return mainBrowserNodeNameList;
        }

        //Build the list of root BrowseNodeId values.
        //tmp only support US
        //later will add: CA CN DE ES FR IN IT JP UK
        //Every entry is paired, by index, with the entry of buildMainBrowserNodeNameList
        //(see initAwsCategory); the trailing comment on each line is that paired name.
        //An empty string means "no id for this name"; initAwsCategory skips such entries.
        //http://docs.aws.amazon.com/AWSECommerceService/latest/DG/BrowseNodeIDs.html
        private List<string> buildMainBrowserNodeIdList()
        {
            return new List<string>
            {
                "1036592",      // Apparel
                "2619525011",   // Appliances
                "2617941011",   // ArtsAndCrafts
                "15690151",     // Automotive
                "165796011",    // Baby
                "11055981",     // Beauty
                "1000",         // Books
                "301668",       // Classical
                "4991425011",   // Collectibles
                //Code=AWS.InvalidParameterValue, Message=195208011 is not a valid value for BrowseNodeId. Please change this value and retry your request.
                //"195208011",  // DigitalMusic (disabled)
                "2625373011",   // DVD
                "493964",       // Electronics
                "",             // ForeignBooks
                "",             // Garden
                //Request valid, but Error: Code=AWS.InvalidParameterValue, Message=3580501 is not a valid value for BrowseNodeId. Please change this value and retry your request.
                //"3580501",    // GourmetFood (disabled)
                "16310101",     // Grocery
                "3760931",      // HealthPersonalCare
                "",             // Hobbies
                "",             // Home
                //Request valid, but Error: Code=AWS.InvalidParameterValue, Message=285080 is not a valid value for BrowseNodeId. Please change this value and retry your request.
                //"285080" is replaced by the Home & Kitchen id:
                "1055398",      // HomeGarden
                //https://www.crifan.org/amazon_asw_api_usage_notice/
                //PetSupplies=1063498, is just sub category of Home & Kitchen=1055398
                "",             // HomeImprovement
                "228239",       // Industrial
                "3880591",      // Jewelry
                "133141011",    // KindleStore
                //seems miss this
                "1063498",      // Kitchen
                "2972638011",   // LawnAndGarden
                "",             // Lighting
                "599872",       // Magazines
                "10304191",     // Miscellaneous
                "2350149011",   // MobileApps
                //Request valid, but Error: Code=AWS.InvalidParameterValue, Message=195211011 is not a valid value for BrowseNodeId. Please change this value and retry your request.
                //"195211011",  // MP3Downloads (disabled)
                "301668",       // Music
                "11091801",     // MusicalInstruments
                "1084128",      // OfficeProducts
                //http://www.browsenodes.com/node-2619533011.html
                "2619533011",   // PetSupplies
                "",             // Shoes
                "409488",       // Software
                "",             // SoftwareVideoGames
                "3375251",      // SportingGoods
                "468240",       // Tools
                //https://www.crifan.org/aws_api_toys_browsernodeid_493964_not_root_category/
                //so "493964" is not used for Toys
                "165793011",    // Toys
                //Request valid, but Error: Code=AWS.InvalidParameterValue, Message=404272 is not a valid value for BrowseNodeId. Please change this value and retry your request.
                //"404272",     // VHS (disabled)
                "130",          // Video
                "377110011",    // Watches
            };
        }

        //(Re)load one tree node from AWS: look up the BrowseNode stored in the node's Tag,
        //refresh the node's Text/Tag from the response and add one child tree node per child BrowseNode.
        private void initSingleTreeNode(TreeNode curTreeNode)
        {
            crifanLibAws.awsBrowseNode curBrowseNode = (crifanLibAws.awsBrowseNode)curTreeNode.Tag;

            //debug aid, see www.crifan.org/amazon_asw_api_usage_notice/ :
            //curBrowseNode.BrowseNodeId = "1055398"; //Home & Kitchen

            crifanLibAws.awsBrowseNodeLookupResp browseNodeLookupResp = aws.awsGetBrowseNodeLookupResp(curBrowseNode.BrowseNodeId);
            if (!string.IsNullOrEmpty(browseNodeLookupResp.selfBrowseNode.Name))
            {
                if (curTreeNode.Parent == null)
                {
                    //no parent -> root TreeNode -> keep the original (initialized root category) name;
                    //a non-root TreeNode uses the name extracted from the response
                    browseNodeLookupResp.selfBrowseNode.Name = curBrowseNode.Name;
                }

                curTreeNode.Text = 
browseNodeLookupResp.selfBrowseNode.Name;
                curTreeNode.Tag = browseNodeLookupResp.selfBrowseNode;

                if ((browseNodeLookupResp.Children != null) && (browseNodeLookupResp.Children.Count > 0))
                {
                    //for show in tree node
                    foreach (crifanLibAws.awsBrowseNode childBrowseNode in browseNodeLookupResp.Children)
                    {
                        TreeNode subTreeNode = new TreeNode();
                        subTreeNode.Text = childBrowseNode.Name;
                        subTreeNode.Tag = childBrowseNode;
                        subTreeNode.ContextMenuStrip = cmsSelection;

                        curTreeNode.Nodes.Add(subTreeNode);
                    }

                    gLogger.Info(String.Format("Category [{0}] : {1} chilren", curTreeNode.Text, browseNodeLookupResp.Children.Count));
                }
                else
                {
                    gLogger.Info(String.Format("Category [{0}] : No chilren", curTreeNode.Text));
                }
            }
            else
            {
                gLogger.Debug("can not get BrowseNodeLookup Response for singleRootBrowseNodeId=" + curBrowseNode.BrowseNodeId);
            }
        }

        //Build gMainBrowserNodeList from the parallel name/id lists and add one root tree node
        //per usable entry to trvCategoryTree. Entries whose id is empty are skipped (and logged).
        private void initAwsCategory()
        {
            List<string> mainBrowserNodeNameList = buildMainBrowserNodeNameList();
            List<string> mainBrowserNodeIdList = buildMainBrowserNodeIdList();

            gMainBrowserNodeList = new List<crifanLibAws.awsBrowseNode>();

            //The two lists are paired by index. If they ever get out of sync, only pair the
            //common part instead of failing with an out-of-range index.
            if (mainBrowserNodeNameList.Count != mainBrowserNodeIdList.Count)
            {
                gLogger.Info(String.Format("browser node name count {0} not equal to id count {1}, only the common part is used",
                    mainBrowserNodeNameList.Count, mainBrowserNodeIdList.Count));
            }
            int pairCount = Math.Min(mainBrowserNodeNameList.Count, mainBrowserNodeIdList.Count);

            for (int idx = 0; idx < pairCount; idx++)
            {
                string mainBrowserNodeName = mainBrowserNodeNameList[idx];
                string mainBrowserNodeId = mainBrowserNodeIdList[idx];

                if (string.IsNullOrEmpty(mainBrowserNodeId))
                {
                    gLogger.Debug(String.Format("browser node id is empty for name={0} ", mainBrowserNodeName));
                    continue;
                }

                crifanLibAws.awsBrowseNode mainBrowserNode = new crifanLibAws.awsBrowseNode();
                mainBrowserNode.Name = mainBrowserNodeName;
                mainBrowserNode.BrowseNodeId = mainBrowserNodeId;

                gMainBrowserNodeList.Add(mainBrowserNode);
            }

            //init search category tree: one root TreeNode per main browse node
            for (int idx = 0; idx < gMainBrowserNodeList.Count; idx++)
            {
                crifanLibAws.awsBrowseNode mainBrowserNode = gMainBrowserNodeList[idx];
                gLogger.Trace(String.Format("[{0:D2}]{1}\t\t\t={2}", idx + 1, mainBrowserNode.Name, mainBrowserNode.BrowseNodeId));

                TreeNode rootTreeNode = new TreeNode();
                rootTreeNode.Text = mainBrowserNode.Name;
                rootTreeNode.Tag = mainBrowserNode;
                rootTreeNode.ContextMenuStrip = cmsSelection;

                trvCategoryTree.Nodes.Add(rootTreeNode);
            }
        }

        //Get the SearchIndex of the input TreeNode: the Name of the BrowseNode stored in the
        //Tag of its root TreeNode. Returns "" when no root TreeNode is found.
        private string getSearchIndex(TreeNode curTreeNode)
        {
            //find the root node
            TreeNode rootTreeNode = crl.findRootTreeNode(curTreeNode);
            if (rootTreeNode == null)
            {
                return "";
            }

            //the root TreeNode name is the SearchIndex
            crifanLibAws.awsBrowseNode rootBrowseNode = (crifanLibAws.awsBrowseNode)rootTreeNode.Tag;
            return rootBrowseNode.Name;
        }

        //Get the full category name of the input TreeNode: the Text of every node from the
        //root down to the input node, joined like:
        //  xxx -> xxx -> xxx
        private string getFullCategoryName(TreeNode curTreeNode)
        {
            string strFullCategoryName = curTreeNode.Text;

            //walk from the current TreeNode up to the root TreeNode, prepending each ancestor
            for (TreeNode ancestorTreeNode = curTreeNode.Parent; ancestorTreeNode != null; ancestorTreeNode = ancestorTreeNode.Parent)
            {
                strFullCategoryName = ancestorTreeNode.Text + " -> " + strFullCategoryName;
            }

            return strFullCategoryName;
        }

        //Get the BrowseNode stored in the Tag of the currently selected tree node,
        //or a new empty BrowseNode when nothing is selected.
        private crifanLibAws.awsBrowseNode getCurSelBrowserNode()
        {
            if (trvCategoryTree.SelectedNode == null)
            {
                //can not use gLogger here, for it has not init yet
                return new crifanLibAws.awsBrowseNode();
            }

            return (crifanLibAws.awsBrowseNode)trvCategoryTree.SelectedNode.Tag;
        }

        private void searchSingleBrowseNodeId(string curBrowseNodeId, string curSearchIndex, string curFullCategoryName = "")
        {
            string strFullCategoryName = String.Format("FullCategoryName={0}", curFullCategoryName);
            string strFormattedFullCategoryName = crl.formatString(strFullCategoryName, '=');

            string 
strStartSearch = String.Format("Start search for BrowseNodeId={0}, SearchIndex={1}", curBrowseNodeId, curSearchIndex);
            string strFormattedStartSearch = crl.formatString(strStartSearch, '=');
            gLogger.Info(strFormattedStartSearch);
            gLogger.Info(strFormattedFullCategoryName);

            //get first page search result
            string strFirstPageNum = "1";
            crifanLibAws.awsSearchResultInfo firstPageSearchResultInfo = aws.awsGetBrowserNodeSearchResultItemList(curBrowseNodeId, curSearchIndex, strFirstPageNum);
            if (firstPageSearchResultInfo.SearchResultItemList != null)
            {
                foreach (crifanLibAws.awsSearchResultItem eachItem in firstPageSearchResultInfo.SearchResultItemList)
                {
                    if (!needContinueSearch)
                    {
                        break;
                    }

                    processAwsSearchItem(eachItem);
                }
            }

            //process following page (page 2-10) search item list, if available
            //TryParse returns false for null, empty and non-numeric text, so a malformed
            //TotalPages only skips the following pages instead of throwing FormatException
            int intTotalPages;
            if (Int32.TryParse(firstPageSearchResultInfo.TotalPages, out intTotalPages))
            {
                //at most 10 pages are requested per search
                int awsPageNumLimit = 10;
                int maxPageNum = Math.Min(intTotalPages, awsPageNumLimit);

                for (int curPageNum = 2; curPageNum <= maxPageNum; curPageNum++)
                {
                    if (!needContinueSearch)
                    {
                        break;
                    }

                    crifanLibAws.awsSearchResultInfo curPageItemList = aws.awsGetBrowserNodeSearchResultItemList(curBrowseNodeId, curSearchIndex, curPageNum.ToString());
                    if (curPageItemList.SearchResultItemList != null)
                    {
                        foreach (crifanLibAws.awsSearchResultItem eachItem in curPageItemList.SearchResultItemList)
                        {
                            if (!needContinueSearch)
                            {
                                break;
                            }

                            processAwsSearchItem(eachItem);
                        }
                    }
                }
            }
            else if (!string.IsNullOrEmpty(firstPageSearchResultInfo.TotalPages))
            {
                gLogger.Debug("can not parse TotalPages=" + firstPageSearchResultInfo.TotalPages);
            }

            string strEndSearch = String.Format("End of search for BrowseNodeId={0}, SearchIndex={1}", curBrowseNodeId, curSearchIndex);
            string strFormattedEndSearch = crl.formatString(strEndSearch, '=');
            gLogger.Info(strFormattedEndSearch);
            gLogger.Info(strFormattedFullCategoryName);
        }

        //for some TreeNode, first find all child node, then do search for each node
        private void doSearchForAllChildOfSingleTreeNode(TreeNode curTreeNode)
        {
            //find all sub child nodes, meanwhile do search
            if (curTreeNode != null)
            {
                //1. 
find all child
                if (curTreeNode.GetNodeCount(false) <= 0)
                {
                    //(1) if no child -> maybe really no child, or has not init -> re-init to get all child
                    initSingleTreeNode(curTreeNode);
                }

                //re-check, maybe above step has re-got some child
                if (curTreeNode.GetNodeCount(false) > 0)
                {
                    //(2) if has child -> must has init -> just process each child
                    foreach (TreeNode childTreeNode in curTreeNode.Nodes)
                    {
                        if (!needContinueSearch)
                        {
                            break;
                        }

                        doSearchForAllChildOfSingleTreeNode(childTreeNode);
                    }
                }
                else
                {
                    //still no child, then do real search
                    string curSearchIndex = getSearchIndex(curTreeNode);
                    string curFullCategoryName = getFullCategoryName(curTreeNode);
                    crifanLibAws.awsBrowseNode curBrowserNode = (crifanLibAws.awsBrowseNode)curTreeNode.Tag;
                    searchSingleBrowseNodeId(curBrowserNode.BrowseNodeId, curSearchIndex, curFullCategoryName);
                }
            }
        }

        //Search every category in curSelTreeNodeList: first log the whole selection,
        //then search each selected tree node (including all of its children) in turn.
        private void awsCategorySearch()
        {
            if (curSelTreeNodeList.Count <= 0)
            {
                string strNothingToSearch = "Not select any category, so nothing to search";
                gLogger.Info(crl.formatString(strNothingToSearch, '#'));
                return;
            }

            string strSearchForAll = String.Format("Do search for total selected {0} categories", curSelTreeNodeList.Count);
            gLogger.Info(crl.formatString(strSearchForAll, '#'));

            //pass 1: list what is going to be searched
            for (int idx = 0; idx < curSelTreeNodeList.Count; idx++)
            {
                string fullCategoryName = getFullCategoryName(curSelTreeNodeList[idx]);
                gLogger.Info(String.Format("[{0}] {1}", idx + 1, fullCategoryName));
            }
            gLogger.Info(crl.formatString("#", '#'));

            //pass 2: do the search
            for (int idx = 0; idx < curSelTreeNodeList.Count; idx++)
            {
                TreeNode eachSelectedTreeNode = curSelTreeNodeList[idx];
                string fullCategoryName = getFullCategoryName(eachSelectedTreeNode);
                string strSearchForEach = String.Format("Process for selected category [{0}] {1}", idx + 1, fullCategoryName);
                gLogger.Info(crl.formatString(strSearchForEach, '#'));
                gLogger.Info(crl.formatString("#", '#'));

                doSearchForAllChildOfSingleTreeNode(eachSelectedTreeNode);
            }
        }

        //Process one AWS search result item: fetch its variation list and process every
        //variation ASIN that has not been processed before.
        //  singleAwsSearchItem: the search result item; only its Asin is used here
        //Both the variation ASINs and (once all variations were handled) the searched ASIN
        //are recorded in gProcessedAsinList, so an item that shows up again in another
        //search result is omitted instead of being processed and saved a second time.
        public void processAwsSearchItem(crifanLibAws.awsSearchResultItem singleAwsSearchItem)
        {
            string asinToHandle = singleAwsSearchItem.Asin;

            if (gProcessedAsinList.Contains(asinToHandle))
            {
                gLogger.Debug(String.Format("omit ASIN={0} for has processed it", asinToHandle));
                return;
            }

            //1. find variation if avaliable
            List<crifanLibAws.awsSearchResultItem> variationItemList = aws.awsGetVariationItemList(asinToHandle);
            if (variationItemList == null)
            {
                gLogger.Debug(String.Format("no variation list got for ASIN={0}", asinToHandle));
                return;
            }

            //2. real goto process each item
            bool allVariationsHandled = true;
            foreach (crifanLibAws.awsSearchResultItem singleAsin in variationItemList)
            {
                if (!needContinueSearch)
                {
                    //stopped half way -> do not mark the searched ASIN as done
                    allVariationsHandled = false;
                    break;
                }

                //process each ASIN (product)
                //note: here ParentASIN maybe null -> should check it before use
                string realAsinToHandle = singleAsin.Asin;
                if (gProcessedAsinList.Contains(realAsinToHandle))
                {
                    gLogger.Debug(String.Format("omit ASIN={0} for has processed it", realAsinToHandle));
                    continue;
                }

                processAmazonItem(realAsinToHandle);
                gProcessedAsinList.Add(realAsinToHandle);
            }

            //the searched ASIN is not necessarily one of its own variations, so record it
            //explicitly; otherwise the check at the top would never match it
            if (allVariationsHandled && !gProcessedAsinList.Contains(asinToHandle))
            {
                gProcessedAsinList.Add(asinToHandle);
            }
        }

        private bool awsItemIsValid(crifanLibAws.awsItemAttributes itemAttributes, out string invalidReason)
        {
            bool bItemIsValid = true;
            invalidReason = "valid item";

            //1. check weight
            if (bItemIsValid)
            {
                string strWeightHundredthsPound = itemAttributes.itemDimensions.WeightPound;
                float fWeightPound;
                if (string.IsNullOrEmpty(strWeightHundredthsPound))
                {
                    fWeightPound = 0.0F;
                }
                else
                {
                    float fWeightHundredthsPound = float.Parse(strWeightHundredthsPound);
                    fWeightPound = fWeightHundredthsPound / 100.0F;
                }

                rule_maxWeightPounds = float.Parse(txbMaxWeightPounds.Text);
                if (fWeightPound <= rule_maxWeightPounds)
                {
                    bItemIsValid = true;
                }
                else
                {
                    bItemIsValid = false;
                    invalidReason = String.Format("Weight is {0} pounds, more than weight limit: {1} pounds", fWeightPound, rule_maxWeightPounds);
                }
            }

            //2. check dimension
            if (bItemIsValid)
            {
                string strLengthHundredthsInch = (itemAttributes.itemDimensions.LengthHundredthsInch != null) ? 
itemAttributes.itemDimensions.LengthHundredthsInch : itemAttributes.packageDimensions.LengthHundredthsInch; string strWidthHundredthsInch = (itemAttributes.itemDimensions.WidthHundredthsInch != null) ? itemAttributes.itemDimensions.WidthHundredthsInch : itemAttributes.packageDimensions.WidthHundredthsInch; string strHeightHundredthsInch = (itemAttributes.itemDimensions.HeightHundredthsInch != null) ? itemAttributes.itemDimensions.HeightHundredthsInch : itemAttributes.packageDimensions.HeightHundredthsInch; float fLengthInch; if (string.IsNullOrEmpty(strLengthHundredthsInch)) { fLengthInch = 0.0F; } else { float fLengthHundredthsInch = float.Parse(strLengthHundredthsInch); fLengthInch = fLengthHundredthsInch / 100.0F; } float fWidthInch; if (string.IsNullOrEmpty(strWidthHundredthsInch)) { fWidthInch = 0.0F; } else { float fWeightHundredthsInch = float.Parse(strWidthHundredthsInch); fWidthInch = fWeightHundredthsInch / 100.0F; } float fHeightInch; if (string.IsNullOrEmpty(strHeightHundredthsInch)) { fHeightInch = 0.0F; } else { float fHeightHundredthsInch = float.Parse(strHeightHundredthsInch); fHeightInch = fHeightHundredthsInch / 100.0F; } float fLengthCm = crl.inchToCm(fLengthInch); float fWidthCm = crl.inchToCm(fWidthInch); float fHeightCm = crl.inchToCm(fHeightInch); rule_dimensionMaxLengthCm = float.Parse(txbDimensionLength.Text); rule_dimensionMaxWidthCm = float.Parse(txbDimensionWidth.Text); rule_dimensionMaxHeightCm = float.Parse(txbDimensionHeight.Text); //check valid or not if ( (fLengthCm <= rule_dimensionMaxLengthCm) && (fWidthCm <= rule_dimensionMaxWidthCm) && (fHeightCm <= rule_dimensionMaxHeightCm) ) { bItemIsValid = true; } else { bItemIsValid = false; invalidReason = String.Format("Dimension: {0}cm x {1}cm x {2}cm invalid for exceed dimension limit: {3}cm x {4}cm x {5}cm", fLengthCm, fWidthCm, fHeightCm, rule_dimensionMaxLengthCm, rule_dimensionMaxWidthCm, rule_dimensionMaxHeightCm); } } if (bItemIsValid) { //get offer full info //following info get 
by awsGetOffersInfo //is NOT unit number, but is OFFER number //eg: //B0009IQXFO //http://www.amazon.com/gp/offer-listing/B0009IQXFO //can see 18 offers //then here get: //Asin "B0009IQXFO" //TotalCollectible "0" //TotalNew "18" //TotalOfferPages "1" //TotalOffers "1" //TotalRefurbished "0" //TotalUsed "0" crifanLibAws.awsOffersInfo offersInfo = aws.awsGetOffersInfo(itemAttributes.Asin); //3. check buyer number if (bItemIsValid) { int totalOfferNum = 0; if( (!string.IsNullOrEmpty(offersInfo.TotalNew)) && (!string.IsNullOrEmpty(offersInfo.TotalUsed)) && (!string.IsNullOrEmpty(offersInfo.TotalCollectible)) && (!string.IsNullOrEmpty(offersInfo.TotalRefurbished)) ) { int intOfferTotalNew = Int32.Parse(offersInfo.TotalNew); int intOfferTotalUsed = Int32.Parse(offersInfo.TotalUsed); int intOfferTotalCollectible = Int32.Parse(offersInfo.TotalCollectible); int intOfferTotalRefurbished = Int32.Parse(offersInfo.TotalRefurbished); totalOfferNum = intOfferTotalNew + intOfferTotalUsed + intOfferTotalCollectible + intOfferTotalRefurbished; } else { totalOfferNum = 0; } rule_minimalBuyerNumber = Int32.Parse(txbMinBuyerNum.Text); if (totalOfferNum >= rule_minimalBuyerNumber) { bItemIsValid = true; } else { bItemIsValid = false; invalidReason = String.Format("buyer number {0} less than minimal limit {1}", totalOfferNum, rule_minimalBuyerNumber); } } //4. 
//check total unit number
                if (bItemIsValid)
                {
                    string itemAsin = itemAttributes.Asin;
                    string productUrl = amazonLib.generateProductUrlFromAsin(itemAsin);
                    bool bTotalUnitNumValid = checkTotalUnitNumber(productUrl, out invalidReason);
                    if (bTotalUnitNumValid)
                    {
                        bItemIsValid = true;
                    }
                    else
                    {
                        bItemIsValid = false;
                        //invalidReason = invalidReason;
                    }
                }
            }

            return bItemIsValid;
        }

        /// <summary>
        /// Fetches the attributes of one product via the AWS wrapper, validates them with
        /// awsItemIsValid, logs the verdict and, when valid, extracts and saves the product.
        /// </summary>
        /// <param name="itemAsin">ASIN of the product to process.</param>
        public void processAmazonItem(string itemAsin)
        {
            gLogger.Trace("Processing amazon product ASIN=" + itemAsin);

            crifanLibAws.awsItemAttributes itemAttributes = aws.awsGetItemAttributes(itemAsin);

            string invalidReason = "";
            bool isValid = awsItemIsValid(itemAttributes, out invalidReason);

            // The running item counter advances exactly once per processed item, valid or not.
            string verdict = isValid ? "Valid" : "Invalid";
            string banner = String.Format("[{0}] {1}: {2}", gCurItemNum++, verdict, itemAttributes.Asin);
            gLogger.Info(crl.formatString(banner, '-', 120));

            if (!isValid)
            {
                gLogger.Info("InvalidReason=" + invalidReason);
                return;
            }

            awsFindAndSaveItem(itemAttributes.Asin, itemAttributes);
        }

        /// <summary>
        /// Collects the full product info for an already validated item and saves it.
        /// Nothing is saved when awsGetAllProductInfo reports failure.
        /// </summary>
        /// <param name="itemAsin">ASIN of the product.</param>
        /// <param name="itemAttributes">Attributes previously fetched for that ASIN.</param>
        private void awsFindAndSaveItem(string itemAsin, crifanLibAws.awsItemAttributes itemAttributes)
        {
            AmazonProductInfo productInfo;
            if (!awsGetAllProductInfo(itemAsin, itemAttributes, out productInfo))
            {
                return;
            }

            saveProductInfo(productInfo);
        }

        /// <summary>
        /// Downloads up to <paramref name="imgFullnameArrLength"/> images into
        /// "&lt;output folder&gt;/&lt;image folder&gt;/&lt;ASIN&gt;" and reports the URLs that were processed.
        /// </summary>
        /// <param name="itemAsin">ASIN, used as the per-product sub folder name.</param>
        /// <param name="imageUrlList">Candidate image URLs; null is treated as an empty list.</param>
        /// <param name="imgFullnameArrLength">Maximum number of images to download.</param>
        /// <param name="savedImageUrlList">
        /// Receives the URLs of the processed images, in order. Never null; its length is
        /// min(imageUrlList.Count, imgFullnameArrLength).
        /// </param>
        private void awsDownloadPictures(string itemAsin, List<string> imageUrlList, int imgFullnameArrLength, out string[] savedImageUrlList)
        {
            // A missing list means "no images"; the out parameter must still get a usable array.
            if (imageUrlList == null)
            {
                imageUrlList = new List<string>();
            }

            string imageRootFolder = Path.Combine(txbOutputFolder.Text, defaultOutputImageFolderName);
            string itemImageFolder = Path.Combine(imageRootFolder, itemAsin);
            // CreateDirectory does nothing when the folder already exists, so no Exists() pre-check is needed.
            Directory.CreateDirectory(itemImageFolder);

            int imageCount = Math.Min(imageUrlList.Count, imgFullnameArrLength);
            savedImageUrlList = new string[imageCount];

            for (int idx = 0; idx < imageCount; idx++)
            {
                string imageUrl = imageUrlList[idx];
                gLogger.Info(String.Format("[Image{0}]\t\t\t{1}", idx + 1, imageUrl));

                string targetFile = Path.Combine(itemImageFolder, crl.extractFilenameFromUrl(imageUrl));
                gLogger.Debug(String.Format("Downloading {0}", imageUrl));
                gLogger.Debug(String.Format("to {0}", targetFile));

                string errorStr = "";
                crl.downloadFile(imageUrl, targetFile, out errorStr, updateProgress);

                // NOTE(review): assumes downloadFile leaves errorStr empty on success - confirm.
                // Previously the message was dropped without any trace.
                if (!string.IsNullOrEmpty(errorStr))
                {
                    gLogger.Debug(String.Format("Download of {0} reported: {1}", imageUrl, errorStr));
                }

                // The URL is recorded regardless of the download outcome (unchanged behaviour).
                savedImageUrlList[idx] = imageUrl;
            }
        }

        /// <summary>
        /// Copies the leading entries of <paramref name="source"/> into <paramref name="target"/>,
        /// stopping at whichever is shorter. Null entries are stored as empty strings; slots of
        /// <paramref name="target"/> beyond the copied range are left untouched.
        /// </summary>
        private static void fillLeadingBullets(string[] target, IList<string> source)
        {
            int copyCount = Math.Min(target.Length, source.Count);
            for (int idx = 0; idx < copyCount; idx++)
            {
                target[idx] = source[idx] ?? string.Empty;
            }
        }

        /// <summary>
        /// Fills productInfo.description and productInfo.bulletArr from whatever is available:
        /// 1. description and bullets: use both as they are (only the first bulletArr.Length bullets);
        /// 2. bullets only: the description becomes all bullets, one per line;
        /// 3. description only: the bullets become the first sentences of the description;
        /// 4. neither: the description is empty and the bullet array is reset via crl.emptyStringArray.
        /// </summary>
        /// <param name="productInfo">Product record to fill; bulletArr must already be allocated.</param>
        /// <param name="description">Plain-text description, may be null or empty.</param>
        /// <param name="bulletList">Feature bullets, may be null or empty.</param>
        private void checkDescriptionAndBullets(ref AmazonProductInfo productInfo, string description, List<string> bulletList)
        {
            if (bulletList == null)
            {
                bulletList = new List<string>();
            }

            bool hasDescription = !string.IsNullOrEmpty(description);
            bool hasBullets = bulletList.Count > 0;

            if (hasBullets)
            {
                // There may be more or fewer bullets than slots; only the leading ones are kept.
                fillLeadingBullets(productInfo.bulletArr, bulletList);

                if (hasDescription)
                {
                    productInfo.description = description;
                }
                else
                {
                    // No description: build one out of ALL bullets, not just the stored ones.
                    StringBuilder joinedBullets = new StringBuilder();
                    foreach (string bullet in bulletList)
                    {
                        joinedBullets.Append(bullet).Append(Environment.NewLine);
                    }
                    productInfo.description = joinedBullets.ToString();
                }
            }
            else if (hasDescription)
            {
                productInfo.description = description;

                // Split into sentences. Pieces are trimmed and blank ones dropped, otherwise the
                // text after the final '.' yields an empty bullet and every bullet but the first
                // starts with a space.
                List<string> sentences = new List<string>();
                foreach (string piece in description.Split('.'))
                {
                    string sentence = piece.Trim();
                    if (sentence.Length > 0)
                    {
                        sentences.Add(sentence);
                    }
                }
                fillLeadingBullets(productInfo.bulletArr, sentences);
            }
            else
            {
                // Nothing to work with: leave an empty description and reset the bullets.
                productInfo.description = string.Empty;
                crl.emptyStringArray(productInfo.bulletArr);
            }
        }

        private bool
awsGetAllProductInfo(string itemAsin, crifanLibAws.awsItemAttributes itemAttributes, out AmazonProductInfo productInfo)
        {
            // Gathers everything that is stored for one product: title, description, bullets,
            // images, cheapest seller price, keyword fields, review number and best seller flag.
            // Returns true when all mandatory parts were found; returns false (after logging the
            // reason) when seller info, a valid price or best seller rank info is missing.
            // productInfo is always assigned, but is only complete when true is returned.
            gLogger.Debug("Extracting info for: " + itemAsin);

            string productUrl = amazonLib.generateProductUrlFromAsin(itemAsin);

            productInfo = new AmazonProductInfo();
            productInfo.url = productUrl;
            productInfo.cheapestPrice = float.MaxValue; // sentinel: "no price found yet"
            productInfo.isOneSellerIsAmazon = false;

            // The arrays must be initialised up front: when fewer than 5 bullets are found the
            // remaining slots would otherwise stay null and break later processing.
            productInfo.bulletArr = new string[5];
            crl.emptyStringArray(productInfo.bulletArr);
            productInfo.imgUrlArr = new string[5];
            crl.emptyStringArray(productInfo.imgUrlArr);
            productInfo.keywordFieldArr = new string[3];
            crl.emptyStringArray(productInfo.keywordFieldArr);

            //1. title
            productInfo.title = itemAttributes.Title;
            gLogger.Info("[Title]\t\t\t" + productInfo.title);

            //2. description (editorial review with the html tags removed)
            crifanLibAws.awsEditorialReview editorialReview = aws.awsGetEditorialReview(itemAttributes.Asin);
            string originContentHtml = editorialReview.Content;
            // Products without editorial content must not crash the tag stripping or the Trim().
            string description = string.IsNullOrEmpty(originContentHtml)
                ? string.Empty
                : crl.htmlRemoveTag(originContentHtml);
            description = (description ?? string.Empty).Trim();

            //3. bullets, reconciled with the description
            checkDescriptionAndBullets(ref productInfo, description, itemAttributes.FeatureList);

            // enforce the maximum length of the whole description
            rule_maxDescriptionLen = Int32.Parse(txbMaxDescriptionLen.Text);
            if (productInfo.description.Length > rule_maxDescriptionLen)
            {
                productInfo.description = productInfo.description.Substring(0, rule_maxDescriptionLen);
            }

            // enforce the maximum length of each bullet and count the non-empty ones
            rule_maxLenEachBullet = Int32.Parse(txbEachBulletMaxLen.Text);
            int realBulletNum = 0;
            for (int idx = 0; idx < productInfo.bulletArr.Length; idx++)
            {
                // A null bullet (e.g. a null feature entry) is normalised instead of dereferenced.
                string bullet = productInfo.bulletArr[idx] ?? string.Empty;
                if (bullet.Length > rule_maxLenEachBullet)
                {
                    bullet = bullet.Substring(0, rule_maxLenEachBullet);
                }
                productInfo.bulletArr[idx] = bullet;

                if (bullet.Length > 0)
                {
                    ++realBulletNum;
                }
            }

            // only the first 40 chars of the description go to the log
            int descPreviewLen = Math.Min(40, productInfo.description.Length);
            gLogger.Info("[Description]\t\t" + productInfo.description.Substring(0, descPreviewLen) + " ......");
            gLogger.Info("[BulletList]\t\t\tTotal " + realBulletNum.ToString() + " bullets");

            //4. images
            //(1) collect image urls: the AWS API only provides the primary large image(s) ...
            List<string> imageUrlList = new List<string>();
            crifanLibAws.awsImages imagesInfo = aws.awsGetImages(itemAsin);
            if (imagesInfo.LargeImageList != null)
            {
                foreach (crifanLibAws.awsImageItem singleImageItem in imagesInfo.LargeImageList)
                {
                    string largeImageUrl = singleImageItem.Url;
                    if (!string.IsNullOrEmpty(largeImageUrl) && !imageUrlList.Contains(largeImageUrl))
                    {
                        imageUrlList.Add(largeImageUrl);
                    }
                }
            }

            // ... so the remaining custom images are extracted from the web page.
            // They get the same null / duplicate filtering as the large images.
            string customImageUrl = amazonLib.generateCustomImageUrlFromAsin(itemAsin);
            List<string> customImageUrlList = amazonLib.extractCustomImageUrlList(customImageUrl);
            if (customImageUrlList != null)
            {
                foreach (string eachCustomImageUrl in customImageUrlList)
                {
                    if (!string.IsNullOrEmpty(eachCustomImageUrl) && !imageUrlList.Contains(eachCustomImageUrl))
                    {
                        imageUrlList.Add(eachCustomImageUrl);
                    }
                }
            }

            //(2) download them and remember what was downloaded
            string[] savedImageUrlList = null;
            awsDownloadPictures(itemAsin, imageUrlList, productInfo.imgUrlArr.Length, out savedImageUrlList);
            if (savedImageUrlList != null)
            {
                int copyCount = Math.Min(savedImageUrlList.Length, productInfo.imgUrlArr.Length);
                Array.Copy(savedImageUrlList, productInfo.imgUrlArr, copyCount);
            }

            //5. seller info: cheapest price and whether Amazon itself is a seller
            // The AWS offer API only returns 2 offers plus a MoreOffersUrl, which is just
            // http://www.amazon.com/gp/offer-listing/<ASIN>, so the listing page is scraped instead.
            string offerListingUrl = amazonLib.generateOfferListingUrl(itemAsin);
            List<crifanLibAmazon.productSellerInfo> allSellerInfoList;
            if (!amazonLib.extractAllSellerInfo(offerListingUrl, out allSellerInfoList))
            {
                gLogger.Info(String.Format("Omit this {0} for not found seller info for {1} ", productUrl, offerListingUrl));
                return false;
            }

            if ((allSellerInfoList == null) || (allSellerInfoList.Count == 0))
            {
                gLogger.Info(String.Format("Omit this {0} for found seller info but is invalid", productUrl));
                return false;
            }

            foreach (crifanLibAmazon.productSellerInfo eachSellerInfo in allSellerInfoList)
            {
                if (eachSellerInfo.price < productInfo.cheapestPrice)
                {
                    productInfo.cheapestPrice = eachSellerInfo.price;
                }

                // static string.Equals tolerates a seller without a name
                if (string.Equals(eachSellerInfo.name, "Amazon.com", StringComparison.CurrentCultureIgnoreCase))
                {
                    productInfo.isOneSellerIsAmazon = true;
                }
            }

            // still the sentinel -> no seller offered a usable price
            if (productInfo.cheapestPrice.CompareTo(float.MaxValue) == 0)
            {
                gLogger.Info(String.Format("Omit this {0} for not find valid cheapest price", productUrl));
                return false;
            }

            gLogger.Info("[CheapestPrice]\t\t" + productInfo.cheapestPrice);
            gLogger.Info("[OneOfSellerIsAmazon]\t" + productInfo.isOneSellerIsAmazon);

            //6. keyword fields, generated from the title
            productInfo.keywordFieldArr = amazonLib.extractProductKeywordField(productInfo.title, productInfo.keywordFieldArr.Length, rule_maxSingleKeywordFieldLen);
            gLogger.Debug("Keyword Field List:");
            if (productInfo.keywordFieldArr != null)
            {
                for (int idx = 0; idx < productInfo.keywordFieldArr.Length; idx++)
                {
                    gLogger.Debug(String.Format("[{0}]={1}", idx, productInfo.keywordFieldArr[idx]));
                }
            }

            //7. review number
            string productHtml = crl.getUrlRespHtml_multiTry(productUrl);
            productInfo.reviewNumber = amazonLib.extractProductReviewNumber(productUrl, productHtml);
            gLogger.Info("[ReviewNumber]\t\t" + productInfo.reviewNumber);

            //8. best seller rank list
            List<crifanLibAmazon.productBestRank> bestSellerRankList = amazonLib.extractProductBestSellerRankList(productUrl);
            if ((bestSellerRankList == null) || (bestSellerRankList.Count == 0))
            {
                // The list must not be dereferenced here: this branch is taken exactly when it
                // is null or empty (the old code called ToArray() on it and could crash).
                gLogger.Debug("bestSellerRankList is null or count not > 0 : " + ((bestSellerRankList == null) ? "null" : "empty"));
                gLogger.Info(String.Format("Omit this {0} for not found valid best seller rank info", productUrl));
                return false;
            }

            productInfo.isBestSeller = true;
            gLogger.Info("[IsBestSeller]\t\t" + productInfo.isBestSeller);

            return true;
        }

        /// <summary>
        /// Keeps the log box scrolled to its end whenever its text changes.
        /// </summary>
        private void rtbLog_TextChanged(object sender, EventArgs e)
        {
            // put the caret at the very end, then scroll it into view
            rtbLog.SelectionStart = rtbLog.Text.Length;
            rtbLog.ScrollToCaret();
        }

        /// <summary>
        /// Tells whether a category node already has sub nodes, i.e. was initialised before.
        /// </summary>
        /// <param name="curSelectedCategoryNode">Node to inspect; null counts as not initialised.</param>
        /// <returns>true when the node has at least one node anywhere in its sub tree.</returns>
        private bool categoryTreeNodeHasInitialized(TreeNode curSelectedCategoryNode)
        {
            // Inspect the node that was passed in, not whatever happens to be selected in the tree.
            return (curSelectedCategoryNode != null)
                && (curSelectedCategoryNode.GetNodeCount(true) > 0);
        }

        /// <summary>
        /// On double click, initialises the selected category node (once) and expands it.
        /// </summary>
        private void trvCategoryTree_DoubleClick(object sender, EventArgs e)
        {
            TreeNode selectedNode = trvCategoryTree.SelectedNode;
            if ((selectedNode == null) || categoryTreeNodeHasInitialized(selectedNode))
            {
                // nothing selected, or already initialised: leave the default behaviour alone
                return;
            }

            initSingleTreeNode(selectedNode);
            selectedNode.Expand();
        }

        /// <summary>
        /// Refreshes the text box listing the full names of all currently selected categories.
        /// </summary>
        private void updateSelectionNotice()
        {
            if (curSelTreeNodeList.Count == 0)
            {
                txbCurFullCategoryName.Text = "Not select any category";
                return;
            }

            // Build the whole text first and assign it once instead of appending to
            // TextBox.Text for every selected node.
            StringBuilder notice = new StringBuilder();
            notice.AppendFormat("Total select {0} categories:", curSelTreeNodeList.Count);
            for (int idx = 0; idx < curSelTreeNodeList.Count; idx++)
            {
                string fullCategoryName = getFullCategoryName(curSelTreeNodeList[idx]);
                notice.Append(Environment.NewLine);
                notice.AppendFormat("[{0}] {1}", idx + 1, fullCategoryName);
            }
            txbCurFullCategoryName.Text = notice.ToString();
        }

        /// <summary>
        /// Refreshes the selection notice after the tree selection changed.
        /// </summary>
        private void trvCategoryTree_AfterSelect(object sender, TreeViewEventArgs e)
        {
            updateSelectionNotice();
        }

        /// <summary>
        /// Handles the context menu: adds the selected tree node to, or removes it from, the
        /// list of chosen categories and (un)highlights it accordingly.
        /// </summary>
        private void cmsSelection_ItemClicked(object sender, ToolStripItemClickedEventArgs e)
        {
            TreeNode curSelTreeNode = trvCategoryTree.SelectedNode;

            // Without a selected node there is nothing to add or remove; adding null would
            // break the later category name lookup.
            if (curSelTreeNode != null)
            {
                bool alreadySelected = curSelTreeNodeList.Contains(curSelTreeNode);

                if ((e.ClickedItem == tsmiAddToSelection) && !alreadySelected)
                {
                    curSelTreeNodeList.Add(curSelTreeNode);
                    crl.highlightNode(trvCategoryTree, curSelTreeNode);
                }
                else if ((e.ClickedItem == tsmiRemoveFromSelection) && alreadySelected)
                {
                    curSelTreeNodeList.Remove(curSelTreeNode);
                    crl.unHighlightNode(trvCategoryTree, curSelTreeNode);
                }
            }

            updateSelectionNotice();
        }

        /// <summary>
        /// Makes a right click select the node under the mouse pointer.
        /// </summary>
        private void trvCategoryTree_MouseUp(object sender, MouseEventArgs e)
        {
            if (e.Button != MouseButtons.Right)
            {
                return;
            }

            trvCategoryTree.SelectedNode = trvCategoryTree.GetNodeAt(e.X, e.Y);
        }

        /****************************** AWS ********************************/
    }
}
【总结】
转载请注明:在路上 » 【代码分享】C#代码:ScrapeAmazonProduct – 抓取Amazon产品数据(主要从AWS API抓取,其次再从网页中抓取)