#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
【作者】 www.crifan.org
【功能】
实现:
http://topic.csdn.net/u/20120520/05/a029e241-22a1-4acf-aa58-b3e306e333f3.html
中的需求。
【说明】
请将如下内容:
<?xml version="1.0" encoding="utf-8"?>
<config>
<ProductDetail brand="品胜" category="首页|电脑整机首页|电脑整机|笔记本配件" id="106537" model="S195B90" price="199.0" productID="189709" siteName="360buy" title="品胜笔记本电脑电源适配器S195B90 适用于索尼笔记本电脑" url="http://www.360buy.com/product/189709.html"/>
<ProductDetail brand="维氏" category="首页|日用百货首页|礼品箱包|瑞士军刀" color="黄色" id="106538" model="救生员0.8623.MWN(荧光刀柄带尼龙刀套)" price="409.0" productID="189704" siteName="360buy" title="瑞士军刀救生员0.8623.MWN(荧光刀柄带尼龙刀套)" url="http://www.360buy.com/product/189704.html"/>
<ProductDetail brand="" category="首页|日用百货首页|礼品箱包|中国军刀" color="黑色" id="106539" model="救生员0.9999.MWN(荧光刀柄带尼龙刀套)" price="499.0" productID="189705" siteName="360buy" title="瑞士军刀救生员0.9999.MWN(荧光刀柄带尼龙刀套)" url="http://www.360buy.com/product/189705.html"/>
</config>
存为UTF-8格式的productConfig.xml,即可实现你的需求,输出三个文件了
"""
import os;
import re;
import codecs;
# Script body: read productConfig.xml, extract every <ProductDetail .../>
# element with regular expressions, then append
#   * each non-empty brand to brands.txt,
#   * the title of each branded product to titles.txt,
#   * the title of each product whose brand is empty or absent to
#     emptyBrandTitlesFileName.txt.
# The print() calls take a single pre-formatted argument so that they behave
# the same as a Python 2 print statement and also parse under Python 3.

# Maps product id (string) -> dict holding that product's parsed attributes.
productListDict = {}  # store id:singleProductDict
# Product ids in the order they first appear in the config file, so the
# output files follow the input order instead of dict iteration order.
productOrder = []

configFileName = "productConfig.xml"
brandFileName = "brands.txt"
titlesFileName = "titles.txt"
emptyBrandTitlesFileName = "emptyBrandTitlesFileName.txt"

# Loose pattern used to cut the raw text into individual <ProductDetail/>
# items; it has no groups, so findall() returns the whole matched strings.
productItemPattern = re.compile(
    r'<ProductDetail.*?category=".+?".+?id="\d+" model=".+?"'
    r' price="\d+?\.\d+?" productID="\d+" siteName=".+?"'
    r' title=".+?" url=".+?"/>'
)
# Strict pattern that names every attribute. The brand and color attributes
# are optional; their groups are None when the attribute is missing.
productDetailPattern = re.compile(
    r'<ProductDetail( brand="(?P<brand>.*?)")?'
    r' category="(?P<category>.+?)"'
    r'( color="(?P<color>.+?)")?'
    r' id="(?P<id>\d+)"'
    r' model="(?P<model>.+?)"'
    r' price="(?P<price>\d+?\.\d+?)"'
    r' productID="(?P<productID>\d+)"'
    r' siteName="(?P<siteName>.+?)"'
    r' title="(?P<title>.+?)"'
    r' url="(?P<url>.+?)"/>'
)

print("input config file name is %s" % configFileName)

# Read the whole config as UTF-8 text; the with-block closes the handle even
# if read() raises.
with codecs.open(configFileName, 'r', 'utf-8') as cfgFile:
    cfgUni = cfgFile.read()
print("cfgUni= %s" % (cfgUni,))
print("type(cfgUni)= %s" % (type(cfgUni),))

foundProductList = productItemPattern.findall(cfgUni)
print("foundProductList= %s" % (foundProductList,))
print("len(foundProductList)= %s" % (len(foundProductList),))

for eachProduct in foundProductList:
    foundProductInfo = productDetailPattern.search(eachProduct)
    print("foundProductInfo= %s" % (foundProductInfo,))
    if not foundProductInfo:
        # The item matched the loose pattern but not the strict one; skip it.
        continue

    # groupdict() yields exactly the named attributes (brand, category,
    # color, id, model, price, productID, siteName, title, url).
    singleProductDict = foundProductInfo.groupdict()
    singleProductDict['wholeItemStr'] = foundProductInfo.group(0)
    print("brand=%(brand)s,category=%(category)s,color=%(color)s,id=%(id)s,"
          "model=%(model)s,price=%(price)s,productID=%(productID)s,"
          "siteName=%(siteName)s,title=%(title)s,url=%(url)s," % singleProductDict)

    # Named productKey rather than id to avoid shadowing the builtin id().
    productKey = singleProductDict['id']
    if productKey not in productListDict:
        productOrder.append(productKey)
    # A later item with the same id replaces the earlier one.
    productListDict[productKey] = singleProductDict

# NOTE(review): 'a+' appends, so running the script again adds the same lines
# a second time -- confirm that accumulating across runs is intended.
with codecs.open(brandFileName, 'a+', 'utf-8') as brandFile:
    with codecs.open(titlesFileName, 'a+', 'utf-8') as productFile:
        with codecs.open(emptyBrandTitlesFileName, 'a+', 'utf-8') as emptyBrandTitlesFile:
            for i, productId in enumerate(productOrder):
                print("---[%d]---" % i)
                singleProductDict = productListDict[productId]
                brand = singleProductDict['brand']
                print("brand= %s" % (brand,))
                if brand:
                    # Brand and title are written together so that line N of
                    # brands.txt corresponds to line N of titles.txt.
                    brandFile.write(brand)
                    brandFile.write("\r\n")
                    productFile.write(singleProductDict['title'])
                    productFile.write("\r\n")
                else:
                    # Empty brand ("") or missing brand attribute (None).
                    emptyBrandTitlesFile.write(singleProductDict['title'])
                    emptyBrandTitlesFile.write("\r\n")

# Reposting notice carried over from the original blog post ("On the Road"):
# please credit the source when reposting -- "Some quick Python code that
# parses product information and outputs it".