【背景】
之前写过一个脚本,前后共写了 Python 2.x 和 Python 3.x 两个版本,用于抓取 chaosgroup.com 中的联系人信息,并保存为 excel 文件。
【scrape_chaosgroup_contact 代码分享】
1.截图:
(1)运行效果:
(2)保存为excel文件:
2.Python项目代码下载:
scrape_chaosgroup_contact_py2.7z
scrape_chaosgroup_contact_py3.7z
3.代码分享:
(1)Python 2.x版本的:scrape_chaosgroup_contact_py2.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Collect all data from a webpage
https://www.elance.com/j/collect-all-data-from-webpage/34563264/
Version: 2012-10-25
Author: Crifan Li
Contact: https://www.crifan.org/about/me/
-------------------------------------------------------------------------------
"""
#---------------------------------import---------------------------------------
import os;
import re;
import sys;
sys.path.append("libs/crifan");
sys.path.append("libs/thirdparty");
import math;
import time;
import codecs;
import logging;
import urllib;
from datetime import datetime,timedelta;
from optparse import OptionParser;
from string import Template,replace;
import xml;
from xml.sax import saxutils;
import crifanLib;
from BeautifulSoup import BeautifulSoup,Tag,CData;
import xlwt;
#--------------------------------const values-----------------------------------
# Script version string.
__VERSION__ = "v0.1"

#--------------------------------const values-----------------------------------
# Container for constant values; currently empty.
gConst = {}

#----------------------------------global values--------------------------------
# Container for shared runtime values; currently empty.
gVal = {}

#--------------------------configurable values---------------------------------
# Container for user-configurable settings; currently empty.
gCfg = {}
#--------------------------functions--------------------------------------------
#------------------------------------------------------------------------------
# Patterns applied to the serialized HTML of one contact's <p> elements.
# Raw strings keep the regex escapes (\s, \#, \|) literal, and compiling the
# patterns once here avoids handing the pattern text to re for every contact.
_PHONE_FAX_RE = re.compile(
    r"<strong>phone:</strong> (?P<phone>.+)<br />\s*?<strong>fax:</strong> (?P<fax>.*)</p>",
    re.S)
_EMAIL_LINK_RE = re.compile(
    r'<strong>e-mail:</strong> <a href="\#">(?P<email>.+)</a><br />\s*'
    r'<strong>V-Ray\|Max link:</strong> <a target="_blank" href="(?P<maxLink>.+)">(?P<vRay>.+)</a>')
_ADDRESS_RE = re.compile(
    r'<p class="addr">\s*<strong>address:</strong> (?P<address>.+)</p>',
    re.S)

# (column title, contact dict key) in the order the columns appear in the sheet.
_EXCEL_COLUMNS = (
    ("Country", 'country'),
    ("Name", 'name'),
    ("Phone", 'phone'),
    ("Fax", 'fax'),
    ("Email", 'email'),
    ("Vray", 'vRay'),
    ("MaxLink", 'maxLink'),
    ("Address", 'address'),
)


def _parseContact(itemSoup):
    """Extract one contact record from a ``countryInfo`` element.

    Returns a dict with the keys country, name, phone, fax, email, vRay,
    maxLink and address. If an expected element or pattern is missing, the
    problem is logged as an error and the script exits with status 2.
    """
    contact = dict.fromkeys(
        ('country', 'name', 'phone', 'fax', 'email', 'vRay', 'maxLink', 'address'),
        "")

    # Guard the heading like every other field, so a missing <h3> produces
    # the same logged error + exit instead of a bare AttributeError.
    countryTag = itemSoup.h3
    if countryTag is None:
        logging.error("Can not find country")
        sys.exit(2)
    contact['country'] = countryTag.string
    logging.debug("itemDict['country']=%s", contact['country'])

    nameTag = itemSoup.find(attrs={"class": "name"})
    if nameTag is None:
        logging.error("Can not find name")
        sys.exit(2)
    contact['name'] = nameTag.string
    logging.debug("itemDict['name']=%s", contact['name'])

    phoneTag = itemSoup.find(attrs={"class": "phone"})
    logging.debug("foundPhone=%s", phoneTag)
    if phoneTag is None:
        logging.error("Can not find phone")
        sys.exit(2)
    phoneHtml = unicode(phoneTag)
    logging.debug("foundPhoneUni=%s", phoneHtml)
    # case 1:
    # <p class="phone"><strong>phone:</strong> 800.206.7886<br />
    # <strong>fax:</strong> 503-295-6533</p>
    # case 2:
    # <p class="phone"><strong>phone:</strong> +1 800 854 4496 or outside US +1 407 833 0600<br />
    # <strong>fax:</strong> +1 813 283 4906
    # </p>
    # case 3:
    # <p class="phone"><strong>phone:</strong> 604 682 6639 x105 <br /><strong>phone:</strong> toll-free 1 800 682 6639 x105<br />
    # <strong>fax:</strong> </p>
    phoneFaxMatch = _PHONE_FAX_RE.search(phoneHtml)
    logging.debug("foundPhoneFax=%s", phoneFaxMatch)
    if phoneFaxMatch is None:
        logging.error("Can not find phone and fax")
        sys.exit(2)
    contact['phone'] = phoneFaxMatch.group("phone").strip()
    contact['fax'] = phoneFaxMatch.group("fax").strip()
    logging.debug("phone=%s,fax=%s", contact['phone'], contact['fax'])

    webTag = itemSoup.find(attrs={"class": "web"})
    logging.debug("foundWeb=%s", webTag)
    if webTag is None:
        logging.error("Can not find web")
        sys.exit(2)
    webHtml = unicode(webTag)
    logging.debug("foundWebUni=%s", webHtml)
    # <p class="web"><strong>e-mail:</strong> <a href="#">sales@cinesysinc.com</a><br />
    # <strong>V-Ray|Max link:</strong> <a target="_blank" href="http://www.cinesysinc.com/page3/page20/page20.html">Cinesys</a>
    # </p>
    emailMatch = _EMAIL_LINK_RE.search(webHtml)
    logging.debug("foundEmailInfo=%s", emailMatch)
    if emailMatch is None:
        logging.error("Can not find email info")
        sys.exit(2)
    contact['email'] = emailMatch.group("email").strip()
    contact['maxLink'] = emailMatch.group("maxLink").strip()
    contact['vRay'] = emailMatch.group("vRay").strip()
    logging.debug("email=%s,maxLink=%s,vRay=%s",
                  contact['email'], contact['maxLink'], contact['vRay'])

    addrTag = itemSoup.find(attrs={"class": "addr"})
    logging.debug("foundAddr=%s", addrTag)
    if addrTag is None:
        logging.error("Can not find addr")
        sys.exit(2)
    # <p class="addr">
    # <strong>address:</strong> 740 SW 21st Ave, Suite #310<br />
    # Portland 97205 Oregon;<br />
    # USA </p>
    addressMatch = _ADDRESS_RE.search(unicode(addrTag))
    if addressMatch is None:
        logging.error("Can not find address")
        sys.exit(2)
    # Drop the <br /> separators; the line breaks between the parts are kept.
    contact['address'] = addressMatch.group("address").replace("<br />", "").strip()
    logging.debug("address=%s", contact['address'])

    return contact


def _saveToExcel(contacts, excelFilename):
    """Write the contacts to a sheet named 'AllContactInfo' and save the workbook."""
    headerStyle = xlwt.easyxf('font: name Times New Roman, color-index red, bold on')
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('AllContactInfo')
    for col, (title, _key) in enumerate(_EXCEL_COLUMNS):
        sheet.write(0, col, title, headerStyle)
    # Row 0 holds the header, so data rows start at 1.
    for row, contact in enumerate(contacts, 1):
        for col, (_title, key) in enumerate(_EXCEL_COLUMNS):
            sheet.write(row, col, contact[key])
    logging.info("Now save all data info excel file: %s", excelFilename)
    workbook.save(excelFilename)


def main():
    """Scrape the chaosgroup.com purchase page and export the contacts to Excel.

    The page is obtained through crifanLib.getUrlRespHtml, every element with
    class ``countryInfo`` is parsed into a contact dict, and all contacts are
    saved to ``allExtractedWebsiteData.xls`` in the current directory.
    Exits with status 2 as soon as one contact cannot be parsed.
    """
    mainUrl = "http://www.chaosgroup.com/en/2/purchase.html?g=0&pID=1"
    logging.debug("mainUrl=%s", mainUrl)
    respHtml = crifanLib.getUrlRespHtml(mainUrl)
    logging.debug("respHtml=%s", respHtml)

    soup = BeautifulSoup(respHtml)
    foundAllItems = soup.findAll(attrs={"class": "countryInfo"})
    logging.debug("foundAllItems=%s", foundAllItems)
    logging.info("Total found %d contact info", len(foundAllItems))

    allItemsDictList = []
    for i, eachItemSoup in enumerate(foundAllItems):
        itemDict = _parseContact(eachItemSoup)
        logging.debug("----------------- parse [%d] OK: %s", i, itemDict)
        # i is 0-based; report how many contacts are done so far.
        logging.info("Successfully processed %d contact info", i + 1)
        allItemsDictList.append(itemDict)

    _saveToExcel(allItemsDictList, "allExtractedWebsiteData.xls")
###############################################################################
if __name__ == "__main__":
    # The log file is named after the value crifanLib.extractFilename derives
    # from argv[0] (presumably the script's base name -- confirm against
    # crifanLib) and is overwritten on every run (filemode 'w').
    scriptSelfName = crifanLib.extractFilename(sys.argv[0])
    logging.basicConfig(
        level=logging.DEBUG,
        format='LINE %(lineno)-4d %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M',
        filename=scriptSelfName + ".log",
        filemode='w')
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s')
    # tell the handler to use this format
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
    try:
        main()
    except Exception:
        # Catch Exception rather than everything: SystemExit (raised by the
        # sys.exit(2) calls after an already-logged parse error) and
        # KeyboardInterrupt must not be reported as "Unknown Error !".
        logging.exception("Unknown Error !")
        raise

# (2) Python 3.x version: scrape_chaosgroup_contact_py3.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
[Function]
Collect all data from a webpage
https://www.elance.com/j/collect-all-data-from-webpage/34563264/
Version: 2012-10-25
Author: Crifan Li
Contact: https://www.crifan.org/about/me/
[NOTE]
This script is for Python 3.x
before you can use this script, should do:
1.install bs4(BeautifulSoup version 4)
http://www.crummy.com/software/BeautifulSoup/bs4/download/beautifulsoup4-4.1.3.tar.gz
->
setup.py install
2. install xlwt3
http://pypi.python.org/pypi/xlwt3/0.1.0
->
http://pypi.python.org/packages/source/x/xlwt3/xlwt3-0.1.0.tar.gz
->
setup.py install
3. modify installed xlwt3
after install, change
Python32\Lib\site-packages\
->
xlwt3\BIFFRecords.py
->
WriteAccessRecord -> __init__
from :
self._rec_data = pack('%ds%ds' % (uowner_len, 0x70 - uowner_len),
uowner, b' '*(0x70 - uowner_len)) # (to_py3): added b'...'
to:
self._rec_data = pack('%ds%ds' % (uowner_len, 0x70 - uowner_len),
uowner.encode("utf-8"), b' '*(0x70 - uowner_len)) # (to_py3): added b'...'
-------------------------------------------------------------------------------
"""
#---------------------------------import---------------------------------------
import os;
import re;
import sys;
sys.path.append("libs/crifan");
sys.path.append("libs/thirdparty");
import math;
import time;
import codecs;
import logging;
import urllib.request, urllib.parse, urllib.error;
from datetime import datetime,timedelta;
from optparse import OptionParser;
import xml;
from xml.sax import saxutils;
import crifanLib;
#from BeautifulSoup import BeautifulSoup,Tag,CData;
from bs4 import BeautifulSoup,Tag,CData;
#import xlwt;
import xlwt3 as xlwt;
#--------------------------------const values-----------------------------------
# Script version string.
__VERSION__ = "v0.1"

#--------------------------------const values-----------------------------------
# Container for constant values; currently empty.
gConst = {}

#----------------------------------global values--------------------------------
# Container for shared runtime values; currently empty.
gVal = {}

#--------------------------configurable values---------------------------------
# Container for user-configurable settings; currently empty.
gCfg = {}
#------------------------------------------------------------------------------
# Patterns applied to str(tag) of one contact's <p> elements. Raw strings are
# used because \s, \# and \| are not valid string escapes, which Python 3
# warns about in ordinary string literals.
#
# Sample of the serialized markup these patterns must match:
#   <p class="phone"><strong>phone:</strong>\xa0800.206.7886<br/>\n<strong>fax:</strong>\xa0503-295-6533</p>
# The text after </strong> starts with a non-breaking space rather than a
# plain space, and <br/> is written without a space, hence no literal space
# after </strong> and the tolerant <br\s*/>.
_PHONE_FAX_RE = re.compile(
    r"<strong>phone:</strong>(?P<phone>.+)<br\s*/>\s*?<strong>fax:</strong>(?P<fax>.*)</p>",
    re.S)
# The link tag is matched independently of attribute order: the page source
# writes <a target="_blank" href="...">, while the serialized sample observed
# when this script was written had href first.
_EMAIL_LINK_RE = re.compile(
    r'<strong>e-mail:</strong>.*?<a href="\#">(?P<email>.+)</a><br\s*/>\s*'
    r'<strong>V-Ray\|Max link:</strong>.*?<a\s[^>]*?href="(?P<maxLink>[^"]+)"[^>]*>(?P<vRay>.+)</a>')
_ADDRESS_RE = re.compile(
    r'<p class="addr">\s*<strong>address:</strong>(?P<address>.+)</p>',
    re.S)
_BR_RE = re.compile(r"<br\s*/>")

# (column title, contact dict key) in the order the columns appear in the sheet.
_EXCEL_COLUMNS = (
    ("Country", 'country'),
    ("Name", 'name'),
    ("Phone", 'phone'),
    ("Fax", 'fax'),
    ("Email", 'email'),
    ("Vray", 'vRay'),
    ("MaxLink", 'maxLink'),
    ("Address", 'address'),
)


def _parseContact(itemSoup):
    """Extract one contact record from a ``countryInfo`` element.

    Returns a dict with the keys country, name, phone, fax, email, vRay,
    maxLink and address. If an expected element or pattern is missing, the
    problem is logged as an error and the script exits with status 2.
    """
    contact = {
        'country': "",
        'name': "",
        'phone': "",
        'fax': "",
        'email': "",
        'vRay': "",
        'maxLink': "",
        'address': "",
    }

    # Guard the heading like every other field, so a missing <h3> produces
    # the same logged error + exit instead of a bare AttributeError.
    countryTag = itemSoup.h3
    if countryTag is None:
        logging.error("Can not find country")
        sys.exit(2)
    contact['country'] = countryTag.string

    nameTag = itemSoup.find(attrs={"class": "name"})
    if nameTag is None:
        logging.error("Can not find name")
        sys.exit(2)
    contact['name'] = nameTag.string

    phoneTag = itemSoup.find(attrs={"class": "phone"})
    if phoneTag is None:
        logging.error("Can not find phone")
        sys.exit(2)
    phoneFaxMatch = _PHONE_FAX_RE.search(str(phoneTag))
    if phoneFaxMatch is None:
        logging.error("Can not find phone and fax")
        sys.exit(2)
    # str.strip() also removes the leading non-breaking space.
    contact['phone'] = phoneFaxMatch.group("phone").strip()
    contact['fax'] = phoneFaxMatch.group("fax").strip()

    webTag = itemSoup.find(attrs={"class": "web"})
    if webTag is None:
        logging.error("Can not find web")
        sys.exit(2)
    # Serialized sample:
    # <p class="web"><strong>e-mail:</strong>\xa0<a href="#">info@3dv.com</a><br/>
    # <strong>V-Ray|Max link:</strong>\xa0<a href="http://www.3dv.com/#/Rendering_Solutions/Chaos_Group/VRay/" target="_blank">3DV Corporation</a>
    # </p>
    emailMatch = _EMAIL_LINK_RE.search(str(webTag))
    if emailMatch is None:
        logging.error("Can not find email info")
        sys.exit(2)
    contact['email'] = emailMatch.group("email").strip()
    contact['maxLink'] = emailMatch.group("maxLink").strip()
    contact['vRay'] = emailMatch.group("vRay").strip()

    addrTag = itemSoup.find(attrs={"class": "addr"})
    if addrTag is None:
        logging.error("Can not find addr")
        sys.exit(2)
    # Serialized sample:
    # <p class="addr">\n<strong>address:</strong>\xa0Kiacheli, 26<br/>\r\n\t\t\t\t\tTbilisi 0108 ;<br/>\r\n\t\t\t\t\tGeorgia\t\t\t\t</p>
    addressMatch = _ADDRESS_RE.search(str(addrTag))
    if addressMatch is None:
        logging.error("Can not find address")
        sys.exit(2)
    # Drop the <br/> separators; the line breaks between the parts are kept.
    contact['address'] = _BR_RE.sub("", addressMatch.group("address")).strip()

    return contact


def _saveToExcel(contacts, excelFilename):
    """Write the contacts to a sheet named 'AllContactInfo' and save the workbook."""
    headerStyle = xlwt.easyxf('font: name Times New Roman, color-index red, bold on')
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('AllContactInfo')
    for col, (title, _key) in enumerate(_EXCEL_COLUMNS):
        sheet.write(0, col, title, headerStyle)
    # Row 0 holds the header, so data rows start at 1.
    for row, contact in enumerate(contacts, 1):
        for col, (_title, key) in enumerate(_EXCEL_COLUMNS):
            sheet.write(row, col, contact[key])
    logging.info("Now save all data info excel file: %s", excelFilename)
    workbook.save(excelFilename)


def main():
    """Scrape the chaosgroup.com purchase page and export the contacts to Excel.

    The page is obtained through crifanLib.getUrlRespHtml, every element with
    class ``countryInfo`` is parsed into a contact dict, and all contacts are
    saved to ``allExtractedWebsiteData.xls`` in the current directory.
    Exits with status 2 as soon as one contact cannot be parsed.
    """
    mainUrl = "http://www.chaosgroup.com/en/2/purchase.html?g=0&pID=1"
    logging.debug("mainUrl=%s", mainUrl)
    respHtml = crifanLib.getUrlRespHtml(mainUrl)

    soup = BeautifulSoup(respHtml, from_encoding="UTF-8")
    foundAllItems = soup.findAll(attrs={"class": "countryInfo"})
    logging.info("Total found %d contact info", len(foundAllItems))

    allItemsDictList = []
    for i, eachItemSoup in enumerate(foundAllItems):
        allItemsDictList.append(_parseContact(eachItemSoup))
        # i is 0-based; report how many contacts are done so far.
        logging.info("Successfully processed %d contact info", i + 1)

    _saveToExcel(allItemsDictList, "allExtractedWebsiteData.xls")
###############################################################################
if __name__ == "__main__":
    # The log file is named after the value crifanLib.extractFilename derives
    # from argv[0] (presumably the script's base name -- confirm against
    # crifanLib) and is overwritten on every run (filemode 'w').
    scriptSelfName = crifanLib.extractFilename(sys.argv[0])
    logging.basicConfig(
        level=logging.DEBUG,
        format='LINE %(lineno)-4d %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M',
        filename=scriptSelfName + ".log",
        filemode='w')
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s')
    # tell the handler to use this format
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
    try:
        main()
    except Exception:
        # Catch Exception rather than everything: SystemExit (raised by the
        # sys.exit(2) calls after an already-logged parse error) and
        # KeyboardInterrupt must not be reported as "Unknown Error !".
        logging.exception("Unknown Error !")
        raise
【总结】
转载请注明:在路上 » 【代码分享】Python代码:scrape_chaosgroup_contact(Python 2.x版本和Python 3.x版本) – 抓取chaosgroup.com中的联系人信息保存为excel