【背景】
这是之前写的一段代码:先处理本地已有的一个html文件,
然后把提取出来的信息导出为各种形式的json字符串。
【scrape_html_to_json代码分享】
1.截图:
(1)运行效果:
(2)输出的各种json字符串:
A。无格式化,无缩进:
[{"yearMonth": {"month": {"string": "November", "value": "11"}, "year": {"string": "2012", "value": "2012"}}, "reservedMonthList": ["2", "3", "8", "9", "10", "11", "12", "13", "17", "18", "19", "20", "21", "22", "23"]}, {"yearMonth": {"month": {"string": "December", "value": "12"}, "year": {"string": "2012", "value": "2012"}}, "reservedMonthList": ["7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "21", "22", "23", "24", "25", "26", "27", "28", "30", "31"]}]
B。普通的:
$calendar = {"listing_id1":{"1":
{"start_date": 2/11/2012,
"end_date": 3/11/2012,
"status": reserved
},"2":
{"start_date": 8/11/2012,
"end_date": 13/11/2012,
"status": reserved
},"3":
{"start_date": 17/11/2012,
"end_date": 23/11/2012,
"status": reserved
},},
"listing_id2":{"1":
{"start_date": 7/12/2012,
"end_date": 16/12/2012,
"status": reserved
},"2":
{"start_date": 21/12/2012,
"end_date": 28/12/2012,
"status": reserved
},"3":
{"start_date": 30/12/2012,
"end_date": 31/12/2012,
"status": reserved
},},
"listing_id3":{"1":
{"start_date": 1/1/2013,
"end_date": 10/1/2013,
"status": reserved
},},
"listing_id4":{"1":
{"start_date": 1/2/2013,
"end_date": 27/2/2013,
"status": reserved
},},
"listing_id5":{},
"listing_id6":{"1":
{"start_date": 2/4/2013,
"end_date": 30/4/2013,
"status": reserved
},},
"listing_id7":{"1":
{"start_date": 1/5/2013,
"end_date": 31/5/2013,
"status": reserved
},},
"listing_id8":{"1":
{"start_date": 1/6/2013,
"end_date": 30/6/2013,
"status": reserved
},},
"listing_id9":{"1":
{"start_date": 1/7/2013,
"end_date": 31/7/2013,
"status": reserved
},},
"listing_id10":{"1":
{"start_date": 1/8/2013,
"end_date": 31/8/2013,
"status": reserved
},},
"listing_id11":{"1":
{"start_date": 1/9/2013,
"end_date": 30/9/2013,
"status": reserved
},},
"listing_id12":{"1":
{"start_date": 1/10/2013,
"end_date": 31/10/2013,
"status": reserved
},},
}
C。带缩进的格式化的json:
[
{
"yearMonth": {
"month": {
"string": "November",
"value": "11"
},
"year": {
"string": "2012",
"value": "2012"
}
},
"reservedMonthList": [
"2",
"3",
"8",
"9",
"10",
"11",
"12",
"13",
"17",
"18",
"19",
"20",
"21",
"22",
"23"
]
},
{
"yearMonth": {
"month": {
"string": "December",
"value": "12"
},
"year": {
"string": "2012",
"value": "2012"
}
},
"reservedMonthList": [
"7",
"8",
"9",
"10",
"11",
"12",
"13",
"14",
"15",
"16",
"21",
"22",
"23",
"24",
"25",
"26",
"27",
"28",
"30",
"31"
]
}
]
注:以上内容不全部相同。只是为了显示效果。
2.Python项目代码下载:
scrape_html_to_json_2012-11-08.7z
3.代码分享:
(1)scrape_html_to_json.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
Web Scraper
https://www.elance.com/j/web-scraper/35025238/
Version: 2012-11-08
Author: Crifan Li
Contact: https://www.crifan.org/about/me/
-------------------------------------------------------------------------------
"""
#---------------------------------import---------------------------------------
import re;
import sys;
sys.path.append("libs");
#import urllib;
import codecs;
from string import Template,replace;
import json;
from BeautifulSoup import BeautifulSoup,Tag,CData;
#------------------------------------------------------------------------------
# from ['2','3','8','9','10','11','12','13','17','18','19','20','21','22','23']
# to
#[
#{
#'startDay': 2,
#'endDay' : 3,
#},
#{
#'startDay': 8,
#'endDay' : 13,
#},
#{
#'startDay': 17,
#'endDay' : 23,
#},
#]
def generateDurationDictList(monthList):
    """Collapse reserved day numbers into runs of consecutive days.

    Despite the parameter name, each item of ``monthList`` is the day number
    of one reserved day within a single month, given as a string (anything
    ``int()`` accepts works).

    Args:
        monthList: iterable of day-number strings, in any order; may be
            empty or None.

    Returns:
        A list of ``{'startDay': int, 'endDay': int}`` dicts, one per run of
        consecutive days, ordered by ``startDay``. An isolated day yields a
        run whose ``startDay`` equals its ``endDay``. Empty/None input yields
        an empty list.

    Raises:
        ValueError / TypeError: if an item cannot be converted by ``int()``.
    """
    durationMonthDictList = []
    if not monthList:
        return durationMonthDictList

    # Convert to int before sorting so that "10" sorts after "9"; the set
    # removes duplicated days, which would otherwise split one run into
    # overlapping pieces.
    dayIntList = sorted(set(int(eachDayStr) for eachDayStr in monthList))

    curStartDay = dayIntList[0]
    curEndDay = curStartDay
    for currentDay in dayIntList[1:]:
        if currentDay == curEndDay + 1:
            # Still inside the current run: just extend it.
            curEndDay = currentDay
            continue
        # Gap found: close the current run and open a new one.
        durationMonthDictList.append({
            'startDay': curStartDay,
            'endDay': curEndDay,
        })
        curStartDay = currentDay
        curEndDay = currentDay

    # Always flush the run that was still open when the input ended;
    # skipping this loses a single-day input or a trailing isolated day.
    durationMonthDictList.append({
        'startDay': curStartDay,
        'endDay': curEndDay,
    })
    return durationMonthDictList
#------------------------------------------------------------------------------
def generateOutputCalendar(MonthDictList):
    """Render the month dicts as the custom ``$calendar = {...}`` text.

    Each month becomes a ``"listing_id<N>":{...},`` section (N counts from 1)
    followed by a blank line (``\\r\\n\\r\\n``). Inside a section, every
    reserved run returned by generateDurationDictList() becomes a numbered
    entry with ``start_date``/``end_date`` written as day/month/year.

    The result is deliberately NOT valid JSON: dates and the ``reserved``
    status are unquoted and every entry keeps its trailing comma.

    Args:
        MonthDictList: list of dicts carrying
            ``['yearMonth']['month']['value']``,
            ``['yearMonth']['year']['value']`` and ``['reservedMonthList']``.

    Returns:
        The whole calendar as one string.
    """
    # One reserved run; built once and reused for every entry.
    durationTemplate = Template(
        '"${number}":\n'
        '{"start_date": ${startDay}/${startMonth}/${startYear},\n'
        '"end_date": ${endDay}/${endMonth}/${endYear},\n'
        '"status": reserved\n'
        '},'
    )

    monthSectionList = []
    for listingNum, monthInfo in enumerate(MonthDictList, 1):
        runList = generateDurationDictList(monthInfo['reservedMonthList'])

        entryList = []
        for entryNum, run in enumerate(runList, 1):
            # A run never crosses a month boundary, so both ends share the
            # month and year of the month being rendered.
            yearMonth = monthInfo['yearMonth']
            entryList.append(durationTemplate.substitute(
                number=entryNum,
                startDay=run['startDay'],
                startMonth=yearMonth['month']['value'],
                startYear=yearMonth['year']['value'],
                endDay=run['endDay'],
                endMonth=yearMonth['month']['value'],
                endYear=yearMonth['year']['value'],
            ))

        sectionStr = '"listing_id' + str(listingNum) + '":{' + "".join(entryList) + "},"
        monthSectionList.append(sectionStr + "\r\n\r\n")

    return "$calendar = {" + "".join(monthSectionList) + "}"
#------------------------------------------------------------------------------
def generateOutputCalendarJsonNoIndent(MonthDictList):
    """Return MonthDictList serialized by json.dumps() with default settings.

    With the defaults no indentation is applied, so the result is a compact
    single-line JSON string.
    """
    return json.dumps(MonthDictList)
#------------------------------------------------------------------------------
def main():
    """Scrape the local test HTML page and export its reserved days.

    Steps:
      1. Extract (and print) the query attribute name from a sample URL.
      2. Parse ``testfiles/test_scrape.htm`` with BeautifulSoup.
      3. For every month header table (class ``text``) read the month/year
         and collect the reserved day cells from the calendar table
         (class ``CalendarCellActive``) at the same position.
      4. Write four text files into the current directory: the custom
         ``$calendar`` string for all months, and indented / non-indented
         JSON dumps of the first two months (demo output).
    """
    testEntryUrl = "http://testingsite.com/CalendarViewPublic.asp?HouseID=39"
    foundSingleAttrFromUrl = re.search(r"http:.+?\?(?P<singleAttr>\w+)=.*?", testEntryUrl)
    if foundSingleAttrFromUrl:
        singleAttr = foundSingleAttrFromUrl.group("singleAttr")
        print("Extract singleAttr=%s from testEntryUrl=%s" % (singleAttr, testEntryUrl))

    testFilename = "testfiles/test_scrape.htm"
    # "with" guarantees the input file is closed once it has been read.
    with codecs.open(testFilename, 'r', "UTF-8") as htmlFile:
        testHtml = htmlFile.read()
    soup = BeautifulSoup(testHtml)

    # <table border="0" cellpadding="2" cellspacing="0" class="text" width="100%">
    foundAllMonthHeader = soup.findAll(name="table", attrs={"class": "text"})
    # <table border="1" class="CalendarCellActive" cellpadding="2" cellspacing="0" style=" border: 1px solid navy; table-layout:fixed" width="100%">
    foundAllMonthContent = soup.findAll(name="table", attrs={"class": "CalendarCellActive"})
    print("Total found %d month's info of reserved days" % (len(foundAllMonthContent)))

    # Compiled once; reused for every month header.
    cboMonthIdP = re.compile(r"cboMonth\d+")
    cboYearIdP = re.compile(r"cboYear\d+")

    MonthDictList = []
    for i, eachMonthHeader in enumerate(foundAllMonthHeader):
        # Taking the first <label> as month and the second as year would be
        # simpler, but it is fragile. Instead locate the hidden inputs by id
        # and read the <label> that shares their parent <td>:
        # <td style="padding-left:0" width="60%"><label>November</label>
        # <input type="Hidden" id="cboMonth1" name="cboMonth1" value="11">
        # </td><td style="padding-right:0;" width="40%">
        # <label>2012</label>
        # <input type="Hidden" id="cboYear1" name="cboYear1" value="2012">
        # </td>
        foundCboMonth = eachMonthHeader.find("input", {"id": cboMonthIdP})
        monthValue = foundCboMonth['value']
        monthStr = foundCboMonth.parent.label.string

        foundCboYear = eachMonthHeader.find("input", {"id": cboYearIdP})
        yearValue = foundCboYear['value']
        yearStr = foundCboYear.parent.label.string

        singleMonthDict = {
            'yearMonth': {
                'year': {
                    'value': yearValue,
                    'string': yearStr,
                },
                'month': {
                    'value': monthValue,
                    'string': monthStr,
                },
            },
            'reservedMonthList': [],  # each item is the string of one reserved day
        }

        # The i-th calendar table is taken to belong to the i-th header;
        # fewer calendar tables than headers raises IndexError here.
        eachMonthContent = foundAllMonthContent[i]
        # <td align="center" class="CalendarCellReserved" id="dd1">2</td>
        foundAllReservedCell = eachMonthContent.findAll("td", {"class": "CalendarCellReserved"})
        for eachReservedCell in foundAllReservedCell:
            singleMonthDict['reservedMonthList'].append(eachReservedCell.string)

        MonthDictList.append(singleMonthDict)
        print("Processed %d month's info" % (i + 1))

    # Custom "$calendar = {...}" output for all months.
    # Note: the values in this output are unquoted, e.g.
    #   {"start_date": 11/7/2012, "end_date": 11/9/2012, "status": reserved},
    # Only if the expected output quoted them, e.g.
    #   {"start_date": "11/7/2012", "end_date": "11/9/2012", "status": "reserved"},
    # could json be used to produce a pretty-printed string for it.
    generatedCalendarStr = generateOutputCalendar(MonthDictList)
    outputFileName = "generatedCalerdarString.txt"
    print("Exporting generated calendar string into %s" % (outputFileName))
    _writeUtf8TextFile(outputFileName, generatedCalendarStr)
    print("Has exported calendar string into %s" % (outputFileName))

    # JSON output demo: only the first two months.
    demoDictList = MonthDictList[0:2]
    _writeUtf8TextFile("treeLikeWithIndentJsonString.txt",
                       json.dumps(demoDictList, indent=1))

    generatedCalendarJsonIndentStr = generateOutputCalendarJsonIndent(demoDictList)
    print("type(generatedCalendarJsonIndentStr)= %s" % (type(generatedCalendarJsonIndentStr)))
    outputFileName_json_indent = "generatedCalerdarString_json_indent.txt"
    print("Exporting generated calendar string json indent into %s" % (outputFileName_json_indent))
    _writeUtf8TextFile(outputFileName_json_indent, generatedCalendarJsonIndentStr)

    generatedCalendarJsonNoIndentStr = generateOutputCalendarJsonNoIndent(demoDictList)
    print("type(generatedCalendarJsonNoIndentStr)= %s" % (type(generatedCalendarJsonNoIndentStr)))
    outputFileName_json_noIndent = "generatedCalerdarString_json_noIndent.txt"
    print("Exporting generated calendar string json no indent into %s" % (outputFileName_json_noIndent))
    _writeUtf8TextFile(outputFileName_json_noIndent, generatedCalendarJsonNoIndentStr)


#------------------------------------------------------------------------------
def generateOutputCalendarJsonIndent(MonthDictList):
    """Return MonthDictList serialized as JSON with a one-space indent."""
    return json.dumps(MonthDictList, indent=1)


#------------------------------------------------------------------------------
def _writeUtf8TextFile(fileName, text):
    """Write text to fileName as UTF-8; the file is closed even on error."""
    with codecs.open(fileName, 'w', 'utf-8') as outputFile:
        outputFile.write(text)
###############################################################################
if __name__=="__main__":
main();
【总结】
转载请注明:在路上 » 【代码分享】Python代码:scrape_html_to_json – 从本地html中抓取信息导出为各种形式的json字符串