抓取车辆信息
抓取汽车之家的车辆信息和车辆图片
CarBrands.java
/** @author : TF-BJ-C064* @creation : 2014-8-19 上午9:57:38* @description : **/package com.car;import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;public class CarBrands {private String name;private String url;private String bid;private List series = new ArrayList();public void add(CarSerie cb ){series.add(cb);}public CarBrands(){}public CarBrands(String name, String href) {this.name = name;this.setUrl(href);}public String getName() {return name;}public void setName(String name) {this.name = name;}public String getUrl() {return url;}//e.g: http://car.autohome.com.cn/price/brand-22.html ==> bid = b22 public void setUrl(String url) {this.url = url;if( url != null ){String regex = "-(\\d+).html";Pattern pattern = Pattern.compile(regex);Matcher matcher = pattern.matcher(url); if (matcher.find()) {String group = matcher.group(1);this.setBid(group);}else {//System.out.println("no matches!!");} }}public String getBid() {return bid;}public void setBid(String bid) {this.bid = bid;}public List getSeries() {return series;}public void setSeries(List series) {this.series = series;}}
CarModels.java
/*
* @author : TF-BJ-C064
* @creation : 2014-8-19 下午2:26:13
* @description :
*
*/package com.car;import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;//车辆类型
public class CarModels {private String name;private String url;private String price;//指导价格private String level;//级别private String structure;//车身结构private String Engine;//发动机private String Transmission;//变速箱private String size;//车身尺寸private String imageurl;private List images = new ArrayList();public boolean add(CarSerieImage imageurl){return images.add(imageurl);}public List getImages() {return images;}public void setImages(List images) {this.images = images;}public String getName() {return name;}public void setName(String name) {this.name = name;}//e.g parurl = http://www.autohome.com.cn/buycar.html?specid=19460pvareaid=101622public void parseAsetUrl(String parurl){if(parurl==null)return ;String regex = "specid=(\\d+)&";Pattern pattern = Pattern.compile(regex);Matcher matcher = pattern.matcher(parurl); int pagesum = 0;if (matcher.find()) {String group = matcher.group(1);this.setUrl(group);}}public String getUrl() {return url;}public void setUrl(String url) {this.url = url;}public String getPrice() {return price;}public void setPrice(String price) {this.price = price;}public String getLevel() {return level;}public void setLevel(String level) {this.level = level;}public String getStructure() {return structure;}public void setStructure(String structure) {this.structure = structure;}public String getEngine() {return Engine;}public void setEngine(String engine) {Engine = engine;}public String getTransmission() {return Transmission;}public void setTransmission(String transmission) {Transmission = transmission;}public String getImageurl() {return imageurl;}public void setImageurl(String imageurl) {this.imageurl = imageurl;}public String getSize() {return size;}public void setSize(String size) {this.size = size;}}
CarSerie.java
/** @author : TF-BJ-C064* @creation : 2014-8-19 上午11:48:14* @description : **/package com.car;import java.util.ArrayList;
import java.util.List;public class CarSerie {private String name;private String url;private String price;//指导价格private String level;//级别private String structure;//车身结构private String Engine;//发动机private String Transmission;//变速箱private String extInfo="";private String extInfoHtml;private List carYearList = new ArrayList();public boolean add(CarYear cy){return this.carYearList.add(cy);}public CarSerie(){}public CarSerie(String name, String href){this.name = name;// http://car.autohome.com.cn/pic/series/66.html;int index = href.lastIndexOf(".html");this.url = href ; //href.substring(0, index) + "-1.html";}public String getName() {return name;}public void setName(String name) {this.name = name;}public String getUrl() {return url;}public void setUrl(String url) {this.url = url;}public List getCarYearList() {return carYearList;}public void setCarYearList(List carYearList) {this.carYearList = carYearList;}public String getPrice() {return price;}public void setPrice(String price) {this.price = price;}public String getLevel() {return level;}public void setLevel(String level) {this.level = level;}public String getStructure() {return structure;}public void setStructure(String structure) {this.structure = structure;}public String getEngine() {return Engine;}public void setEngine(String engine) {Engine = engine;}public String getTransmission() {return Transmission;}public void setTransmission(String transmission) {Transmission = transmission;}public String getExtInfo() {return extInfo;}public void addExtInfo(String extInfoIn) {if(extInfoIn==null)return ;if(this.extInfo!=null && !extInfo.trim().isEmpty())this.extInfo += ", ";this.extInfo += extInfoIn;}public void setExtInfo(String extInfoIn) {if(extInfoIn!=null)this.extInfo = extInfo;}public String getExtInfoHtml() {return extInfoHtml;}public void setExtInfoHtml(String extInfoHtml) {this.extInfoHtml = extInfoHtml;}}
CarSerieImage.java
/*
* @author : TF-BJ-C064
* @creation : 2014-8-19 上午11:52:54
* @description :
*
*/package com.car;public class CarSerieImage {private String title ;private String src ;public CarSerieImage(){}public CarSerieImage(String title, String src){this.title = title;this.src = src;}public String getTitle() {if(title==null || title.trim().isEmpty())return ""+System.currentTimeMillis();return title;}public void setTitle(String title) {this.title = title;}public String getSrc() {return src;}public void setSrc(String src) {this.src = src;}}
CarTree.java
/*
* @author : TF-BJ-C064
* @creation : 2014-8-19 上午9:59:06
* @description :
*
*/package com.car;import java.util.ArrayList;
import java.util.List;

/**
 * Root of the collected data: the list of all brands.
 */
public class CarTree {

    // Typed so that callers can iterate with "for (CarBrands b : getTree())".
    private List<CarBrands> tree = new ArrayList<CarBrands>();

    /**
     * Appends a brand.
     *
     * @return the result of {@link List#add(Object)}
     */
    public boolean add(CarBrands carbs) {
        return tree.add(carbs);
    }

    public List<CarBrands> getTree() {
        return tree;
    }

    public void setTree(List<CarBrands> tree) {
        this.tree = tree;
    }
}
CarYear.java
/*
* @author : TF-BJ-C064
* @creation : 2014-8-19 下午2:48:56
* @description :
*
*/package com.car;import java.util.ArrayList;
import java.util.List;// 车辆款式,如2013款
/**
 * A model year of a series (for example the "2013" edition) and the models listed under it.
 */
public class CarYear {

    private String name;
    // Typed so that callers can iterate with "for (CarModels m : getCarModels())".
    private List<CarModels> carModels = new ArrayList<CarModels>();

    public CarYear() {
    }

    /** @param name display name of the model year */
    public CarYear(String name) {
        this.name = name;
    }

    /**
     * Appends a model.
     *
     * @return the result of {@link List#add(Object)}
     */
    public boolean add(CarModels cm) {
        return this.carModels.add(cm);
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public List<CarModels> getCarModels() {
        return carModels;
    }

    public void setCarModels(List<CarModels> carModels) {
        this.carModels = carModels;
    }
}
QCZJmain.java
/** @author : TF-BJ-C064* @creation : 2014-8-19 上午9:31:38* @description : **/package com.car;import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.Date;import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.xvolks.jnative.exceptions.NativeException;import sun.audio.AudioPlayer;
import sun.audio.AudioStream;/*** 抓取汽车之家汽车数据* Title: QCZJmain
* Description: * args:* args[0]: 车辆信息存储根路径* args[1]: 是否下载图片,默认false* args[2]:是否抓取车辆款式,default=true* e.g:* run.batecho off color 0ajava -jar CarInfoCrawl.jar D:/craw/car20140821 truepause*
* Company:
* @author * @date 2014-8-21*/
public class QCZJmain {public static boolean debug = false;public static int MAX_DEBUG_LINE = 5;private String DIR_ROOT = "data/cars";public static String BASE_URL = "http://car.autohome.com.cn";public static String CAR_SPEC_URL = "http://www.autohome.com.cn/spec/"; // http://www.autohome.com.cn/spec/19390public static String Item_URL = BASE_URL+"/AsLeftMenu/As_LeftListNew.ashx?"; // AsLeftMenu/As_LeftListNew.ashx?typeId=1&brandId=34&fctId=0&seriesId=0public String carhtml = "AB巴博斯(9)
宝骏(82)
宝马(537)
保时捷(148)
北京汽车(20)
北汽幻速(8)
北汽威旺(50)
北汽新能源(3)
北汽制造(29)
奔驰(399)
奔腾(103)
本田(265)
比亚迪(251)
标致(292)
别克(266)
宾利(36)
布加迪(3)
CDFGHJKLMNOQRSTWXYZ";private CarTree cartree = new CarTree(); private boolean bDownloadImage = false;//是否下载图片,默认falseprivate boolean bGetModelDetail = true;public static void main(String[] args) {QCZJmain q = new QCZJmain();if(args!=null){if(args.length >=1 ){q.setDIR_ROOT(args[0]);System.out.println("reset DIR_ROOT : "+q.getDIR_ROOT());}if(args.length >=2 ){if(args[1].equals("true")) q.setbDownloadImage(true);elseq.setbDownloadImage(false);}if(args.length >=3 ){if(args[1].equals("true")) q.setbGetModelDetail(true);elseq.setbGetModelDetail(false);}}q.init();File ftemp = new File(q.getDIR_ROOT());System.out.println("DIR_ROOT : " + ftemp.getAbsolutePath());boolean bsucces = false;while(!bsucces){q.readConfig();//q.play();for(int i=5; i>0; i--){try {System.out.println("wait "+i+" s");Thread.sleep(1000);} catch (InterruptedException e1) {e1.printStackTrace();}}try {bsucces = q.start();} catch (Exception e) {e.printStackTrace();}if(!bsucces){println("Retry.. in 5 Minutes ");try {Thread.sleep(5000);} catch (InterruptedException e) {e.printStackTrace();}}}println("=============== Finish ==============");}public void play(){try {InputStream is = this.getClass().getResourceAsStream("/[000279].wav");AudioStream as=new AudioStream(is);AudioPlayer.player.start(as);}catch (Exception e) {e.printStackTrace();}}//private int carBrandsNumber = 0;private int carSerieNumber = 0;private int carYearNumber = 0;private int carImagesNumber = 0;private int intTemp = 0;private String configFilePath = this.DIR_ROOT + "/config.ini";private String exceptionLogFilePath = this.DIR_ROOT + "/error.log";public void init(){File froot = new File(DIR_ROOT);if(!froot.exists())froot.mkdirs();configFilePath = this.DIR_ROOT + "/config.ini";carserielistPath = this.DIR_ROOT + "/carserielist.txt";exceptionLogFilePath = this.DIR_ROOT + "/error.log";}public void readConfig(){//read config.iniFile configfile = new File(this.configFilePath);if(configfile.exists()){try {FileInputStream fis = new 
FileInputStream(configfile);BufferedReader dr=new BufferedReader(new InputStreamReader(fis));while(true){try {String line = dr.readLine();if(line==null)break ;if(line!=null && !line.trim().isEmpty()){String items[] = line.split(",");if(items!=null && items.length >=4){intTemp = Integer.parseInt(items[0]);carBrandsNumber = intTemp;intTemp = Integer.parseInt(items[1]);carSerieNumber = intTemp;intTemp = Integer.parseInt(items[2]);carImagesNumber = intTemp;intTemp = Integer.parseInt(items[3]);carYearNumber = intTemp;}}} catch (IOException e) {e.printStackTrace();}}} catch (FileNotFoundException e) {e.printStackTrace();}}//read carserielist.txtSystem.out.println("readConfig "+configfile.getAbsolutePath());System.out.println("carBrandsNumber carSerieNumber carImagesNumber carYearNumber");System.out.println(carBrandsNumber +" \t "+ carSerieNumber +" \t "+ carImagesNumber +" \t "+ carYearNumber);}public void saveConfig(int sum1, int sum2, int sum4, int sum3, FileOutputStream cofigout){try {String str = sum1 + "," + sum2 +"," +sum4+ ","+sum3 + ", "+sdf.format(new Date())+"\r\n";cofigout.write( str.getBytes() );} catch (IOException e2) {e2.printStackTrace();}}public static void print(String str){System.out.print(str);}public static void println(String str){System.out.println(str);}// get root car, 获取品牌列表CarBrandspublic void Step1(){println("======= Step 1 ======");Document html = Jsoup.parse(carhtml);Elements emItem = html.select("ul li em");if(emItem!=null)emItem.remove();Elements items = html.select("ul li a");for(Element em : items){if(em!=null){String href = BASE_URL + em.attr("href");String name = em.text();cartree.add(new CarBrands(name, href));}}println("cartree.size=" + cartree.getTree().size());}private String carserielistPath = this.DIR_ROOT + "/carserielist.txt";//get series 获取各个品牌CarBrands的各个系列CarSeriepublic void Step2() throws ClientProtocolException, IOException{println("======= Step 2 ======");HttpClient httpclient = new 
DefaultHttpClient();httpclient.getParams().setParameter(HttpMethodParams.USER_AGENT, "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36");HttpGet get = null;HttpResponse httpResponse = null;File carserielistFile = new File(carserielistPath);FileOutputStream out = new FileOutputStream(carserielistFile);int sum = 0;String str;for(CarBrands carbs : cartree.getTree()){sum ++ ;if(debug){if(sum > this.MAX_DEBUG_LINE)break;}str = "Brand="+carbs.getName() + " \r\n";out.write(str.getBytes());//get series url with price urlget = new HttpGet( Item_URL + "typeId=1&brandId="+carbs.getBid()+"&fctId=0&seriesId=0");httpResponse = httpclient.execute(get);String htmlstring = EntityUtils.toString(httpResponse.getEntity());Document html = Jsoup.parse(htmlstring);Element curli = html.getElementById("b"+carbs.getBid());if(curli!=null){Elements aem = curli.select("dl dd a em");if(aem!=null)aem.remove();Elements series = curli.select("dl dd a");//系列println(carbs.getName() + " ");for(Element serie : series){String href = this.BASE_URL + serie.attr("href");String name = serie.text();CarSerie serieCarbrands= new CarSerie(name, href);//ADD SERIEcarbs.add(serieCarbrands);str = " serie="+serieCarbrands.getName()+" = " + serieCarbrands.getUrl() + "\r\n";print( str );out.write(str.getBytes());}}}out.close();get.releaseConnection();}SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");//获取各个品牌CarBrands的各个系列CarSerie的各个车型CarYear/配置Modelspublic boolean Step3() throws IOException {boolean bok = true;println("======= Step 3 ======");httpclient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000);//连接时间20shttpclient.getParams().setParameter(CoreConnectionPNames.SO_TIMEOUT, 60000);httpclient.getParams().setParameter(HttpMethodParams.USER_AGENT, "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36");File configFile = new File(configFilePath);FileOutputStream 
cofigout = new FileOutputStream(configFile, true);File carlistfile = new File( this.DIR_ROOT+"/carlist.txt"); // "/carlist_"+sdf.format(new Date())+".txt"FileOutputStream out = new FileOutputStream(carlistfile, true);File exceptionLogFile = new File( exceptionLogFilePath ); // "/carlist_"+sdf.format(new Date())+".txt"FileOutputStream exceptionout = new FileOutputStream(exceptionLogFile, true);String tempstr="";String str = "";int sumcars = 0;int sumserie = 0;int sumcaryear = 0;int sum1 = 0;int sum2 = 0;int sum3 = 0;int sum4 = 0;boolean bcontinue = true;for(CarBrands carbs : cartree.getTree()){ //CarBrandsif(debug && sum1 > this.MAX_DEBUG_LINE){break;}if( carBrandsNumber>0 && sum1 < this.carBrandsNumber){println("carBrandsNumber: "+ sum1 +" < "+carBrandsNumber );sum1 ++;continue;}else{carBrandsNumber = 0;}this.saveConfig(sum1, sum2, sum4, sum3, cofigout);println(sum1+" : "+ carbs.getName());//品牌名称str = "brand=" + carbs.getName() + "\r\n" ;try {out.write( str.getBytes() );} catch (IOException e1) {tempstr = sdf.format(new Date())+"\r\n"+e1.getMessage();exceptionout.write( tempstr.getBytes());e1.printStackTrace();}File fcar = new File(DIR_ROOT + "/"+ carbs.getName());if(!fcar.exists())fcar.mkdirs();sum2 = 0;for(CarSerie serie : carbs.getSeries()){ // CarSerieif(debug && sum2 > MAX_DEBUG_LINE){break;}if( carSerieNumber>0 && sum2 < this.carSerieNumber){println(" carSerieNumber: "+ sum2 +" < "+carSerieNumber );sum2 ++;continue;}else{carSerieNumber = 0;}this.saveConfig(sum1, sum2, -1, -1, cofigout);print( " serie=" +serie.getName() );//系列名称str = " serie="+serie.getName() + " " ;try {out.write( str.getBytes() );} catch (IOException e1) {tempstr = sdf.format(new Date())+"\r\n"+e1.getMessage() + e1.getLocalizedMessage();exceptionout.write( tempstr.getBytes());e1.printStackTrace();}File fcarserie = new File( fcar.getAbsolutePath()+ "/"+ serie.getName());if(!fcarserie.exists())fcarserie.mkdirs();try {get = new HttpGet( serie.getUrl() );//点击系列链接,获取该系列详情httpResponse = 
httpclient.execute(get);String htmlstring = EntityUtils.toString( httpResponse.getEntity() );Document html = Jsoup.parse(htmlstring);//获取该系列的级别、车身结构、指导价、发动机、变速箱信息GetDetailSerie(serie, html);str = " [指导价: "+ serie.getPrice() + "," + serie.getExtInfo()+" ]";println(str);try{out.write( str.getBytes() );}catch(Exception e){tempstr = sdf.format(new Date())+"\r\n"+e.getMessage() + e.getLocalizedMessage();exceptionout.write( tempstr.getBytes());e.printStackTrace();}if(this.bGetModelDetail){//获取该系列的在售、停手、预售各个款式Elements brandtabs = html.select(".row .brandtab-cont .tab-nav ul li a"); //获取在售、停手、预售链接if(brandtabs!=null){for(Element brandtabItem : brandtabs){if(brandtabItem!=null){String brandtabhref = brandtabItem.attr("href");//在售、停手、预售链接println(" "+ brandtabItem.text());if(brandtabhref!=null && !brandtabhref.trim().isEmpty()){get = new HttpGet( this.BASE_URL + brandtabhref );//获取在售、停手、预售车辆信息httpResponse = httpclient.execute(get);String htmlstringBrand = EntityUtils.toString( httpResponse.getEntity() );Document htmlBrand = Jsoup.parse(htmlstringBrand);{Elements interval01List = htmlBrand.select("div.intervalcont .interval01 .interval01-list li ");sum3 = 0;for(Element interval : interval01List){Elements carsinfo = interval.select(".interval01-list-cars .infor-title a");if(carsinfo != null){CarYear caryear = new CarYear( carsinfo.text() );//款式名称if(debug && sum3 > MAX_DEBUG_LINE){break;}if( carYearNumber>0 && sum3 < this.carYearNumber){println(" carYearNumber: "+ sum3 +" < "+carYearNumber );sum3 ++;continue;}else{carYearNumber = 0;}print( " " +caryear.getName() );str = " "+caryear.getName() + " " ;try {out.write( str.getBytes() );//款式名称} catch (IOException e1) {tempstr = sdf.format(new Date())+"\r\n"+e1.getMessage() + e1.getLocalizedMessage();exceptionout.write( tempstr.getBytes());e1.printStackTrace();}CarModels carModel = new CarModels();{carModel.setName(caryear.getName());//款式名称//1Elements guidance = interval.select(".interval01-list-guidance 
.guidance-price");//指导价if(guidance != null)carModel.setPrice(guidance.text());//指导价//2carModel.setUrl(carsinfo.attr("href"));//根据此url获取配置信息GetDetailModel(carModel);//3 Elements related = interval.select(".interval01-list-related a[href^=/pic]");//图片if(related!=null){carModel.setImageurl( this.BASE_URL + related.attr("href") );println( " imageurl = "+carModel.getImageurl() );//load images File fcaryear= new File( fcarserie.getAbsolutePath()+ "/"+ caryear.getName());if(!fcaryear.exists())fcaryear.mkdirs();if(this.bDownloadImage){sum4 = SetImagesList(carModel.getImageurl(), carModel, fcaryear, cofigout, sum1, sum2, sum3);}else{this.saveConfig(sum1, sum2, 0, sum3, cofigout);}}str = ", 指导价="+carModel.getPrice() + " , 车身结构="+carModel.getStructure()+", 发动机="+carModel.getEngine()+", 变速箱="+carModel.getTransmission()+", imagepageurl="+carModel.getImageurl()+" \r\n" ; //级别="+carModel.getLevel()+",try{out.write( str.getBytes() );}catch(Exception e){tempstr = sdf.format(new Date())+"\r\n"+e.getMessage() + e.getLocalizedMessage();exceptionout.write( tempstr.getBytes());e.printStackTrace();}}caryear.add(carModel);serie.add(caryear);this.saveConfig(sum1, sum2, sum4, sum3, cofigout);}sum3 ++ ;}//end of for(Element interval : interval01List)}}}}}}} catch (ClientProtocolException e) {if(get!=null)get.releaseConnection();e.printStackTrace();tempstr = sdf.format(new Date())+"\r\n"+e.getMessage();exceptionout.write( tempstr.getBytes());return false;} catch (IOException e) {if(get!=null)get.releaseConnection();e.printStackTrace();tempstr = sdf.format(new Date())+"\r\n"+e.getMessage();exceptionout.write( tempstr.getBytes());return false;}sum2 ++;}sum1 ++;}if(get!=null)get.releaseConnection();try {cofigout.close();out.close();exceptionout.close();} catch (IOException e) {e.printStackTrace();return false;}return true;}//获取该系列的级别、车身结构、发动机、变速箱、指导价信息public void GetDetailSerie(CarSerie serie, Document html){Elements carCont = html.select(".car-cont .list-cont-main 
.main-lever");if(carCont!=null){Elements priceItem = carCont.select(".main-lever-right .lever-price");//指导价if(priceItem!=null)serie.setPrice( priceItem.text() ) ;//指导价Elements carcolors = carCont.select(".main-lever-left ul.lever-ul .lever-ul-color");//车身颜色if(carcolors!=null)carcolors.remove();Elements carExt= carCont.select(".main-lever-left ul.lever-ul li");//获取该系列的级别、车身结构、发动机、变速箱信息if(carExt!=null){serie.setExtInfoHtml(carExt.html());for(Element item : carExt){serie.addExtInfo(item.text());}}}}//根据此url获取配置信息public void GetDetailModel( CarModels carModel){if(carModel.getUrl()==null)return ;get = new HttpGet( carModel.getUrl() );try{httpResponse = httpclient.execute(get);String htmlstring = EntityUtils.toString(httpResponse.getEntity());Document html = Jsoup.parse(htmlstring);Elements cardetails = html.select(".cardetail-infor .cardetail-infor-car li");for(Element cardetail : cardetails){Elements em = cardetail.getElementsContainingText("车身尺寸");if(em!=null && em.size()>0){cardetail.select("span").remove();carModel.setSize(cardetail.text());continue;}em = cardetail.getElementsContainingText("车身结构");if(em!=null && em.size()>0){cardetail.select("span").remove();carModel.setStructure(cardetail.text());continue;}em = cardetail.getElementsContainingText("机");if(em!=null && em.size()>0){cardetail.select("span").remove();carModel.setEngine(cardetail.text());continue;}em = cardetail.getElementsContainingText("箱");if(em!=null && em.size()>0){cardetail.select("span").remove();carModel.setTransmission(cardetail.text());continue;}}}catch(Exception e){}}public int SetImagesList(String imagepage, CarModels carModel, File fcaryear, FileOutputStream cofigout,int sum1, int sum2, int sum3){int sum4 = 0;try{get = new HttpGet( imagepage );httpResponse = httpclient.execute(get);String htmlstring = EntityUtils.toString(httpResponse.getEntity());Document html = Jsoup.parse(htmlstring);Elements imagesElements = html.select(".row .column .uibox .uibox-con ul li>a"); //图片列表sum4 = 
0;for(Element em : imagesElements){//遍历图片列表 liif(debug && sum4 > MAX_DEBUG_LINE){break;}if(this.carImagesNumber>0 && sum4 < this.carImagesNumber){println(" carImagesNumber: "+sum4 +" < "+carImagesNumber );sum4 ++;continue;}else{carImagesNumber = 0;}String imageName = carModel.getName()+"_"+sum4+".jpg";File storeFile = new File( fcaryear.getAbsolutePath() + "/" + imageName );if(storeFile.exists()){println("ignore exist file @ "+storeFile.getAbsolutePath());continue;}if(em!=null){String href = this.BASE_URL + em.attr("href");//获取图片浏览页面链接try{get = new HttpGet( href );httpResponse = httpclient.execute(get);//打开图片浏览页面String htmlstring2 = EntityUtils.toString(httpResponse.getEntity());Document html2 = Jsoup.parse(htmlstring2);Element img = html2.getElementById("img");//获取大图链接if(img!=null){CarSerieImage im = new CarSerieImage( carModel.getName(), img.attr("src"));carModel.add(im);print( " " +im.getTitle()+" img = "+im.getSrc() );downloadPhotos(im.getSrc(), fcaryear.getAbsolutePath(), imageName);//下载图片println("");}}catch(Exception e){e.printStackTrace();}}sum4 ++ ;this.saveConfig(sum1, sum2, sum4, sum3, cofigout);}}catch(Exception e){e.printStackTrace();}return sum4;}HttpClient httpclient = new DefaultHttpClient();HttpGet get = null;HttpResponse httpResponse = null;public void downloadPhotos (String url, String savePath, String saveNamge){httpclient.getParams().setParameter(HttpMethodParams.USER_AGENT, "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36");try {get = new HttpGet( url );httpResponse = httpclient.execute(get);File storeFile = new File( savePath + "/" + saveNamge ); FileOutputStream output = new FileOutputStream(storeFile); //得到网络资源的字节数组,并写入文件 output.write( EntityUtils.toByteArray(httpResponse.getEntity()) ); output.close(); print( " saved image @ "+storeFile.getAbsolutePath() );} catch (Exception e) {e.printStackTrace();}}//save nams 2 carlist.txtpublic void Step4() throws ClientProtocolException, 
IOException{println("======= Step 4 save picture data ======");File froot = new File(DIR_ROOT);if(!froot.exists())froot.mkdirs();File carlistfile = new File(froot.getAbsolutePath()+"/carlist.txt");FileOutputStream out = new FileOutputStream(carlistfile);String str = "";int sumcars = 0;int sumserie = 0;int sumcaryear = 0;for(CarBrands carbs : cartree.getTree()){sumcars ++;str = carbs.getName() + "\r\n" ;out.write( str.getBytes() );for(CarSerie serie : carbs.getSeries()){sumserie ++;str = " "+serie.getName() + "\r\n" ;out.write( str.getBytes() );for(CarYear caryear : serie.getCarYearList()){sumcaryear ++;str = " "+caryear.getName() + "\r\n" ;out.write( str.getBytes() );for(CarModels carModel : caryear.getCarModels()){str = ", 指导价="+carModel.getPrice() + " , 车身结构="+carModel.getStructure()+", 发动机="+carModel.getEngine()+", 变速箱="+carModel.getTransmission()+", imagepageurl="+carModel.getImageurl()+" \r\n" ; //级别="+carModel.getLevel()+",out.write( str.getBytes() );}}}}out.close();println("sumcars = "+sumcars + " sumserie = "+sumserie);try {Thread.sleep(3000);} catch (InterruptedException e) {e.printStackTrace();}}boolean bstarted = false;public boolean start() throws ClientProtocolException, IOException{boolean bsuccess = false;long t1 = System.currentTimeMillis();if(bstarted==false){this.Step1();//获取品牌列表CarBrandsthis.Step2();//获取各个品牌CarBrands的各个系列CarSerie}bstarted = true;bsuccess = this.Step3();//获取各个品牌CarBrands的各个系列CarSerie的各个车型CarYear/配置Models// this.Step4();//存储文本数据和图片// this.Step5();//获取图片并存储图片long t2 = System.currentTimeMillis();long diff = (t2 -t1)/1000;long hour = diff/3600;long minite = (diff - hour*3600)/(60);long sec = diff % 60;println("start at "+sdf.format(new Date(t1)));println("end at "+sdf.format(new Date(t2)));println("it takes "+hour+" h "+minite+" m "+sec+" s ." 
);return bsuccess;}//get pictrue urlpublic void Step5(){println("======= Step 5 ======");HttpClient httpclient = new DefaultHttpClient();HttpGet get = null;HttpResponse httpResponse = null;httpclient.getParams().setParameter(HttpMethodParams.USER_AGENT, "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36");File froot = new File(DIR_ROOT);if(!froot.exists())froot.mkdirs();int sum = 0;for(CarBrands carbs : cartree.getTree()){println(carbs.getName());File fcar = new File(DIR_ROOT + "/"+ carbs.getName());if(!fcar.exists())fcar.mkdirs();sum ++ ;if(debug && sum > MAX_DEBUG_LINE){break;}int sum2 = 0;for(CarSerie serie : carbs.getSeries()){File fcarserie = new File( fcar.getAbsolutePath()+ "/"+ serie.getName());if(!fcarserie.exists())fcarserie.mkdirs();sum2++;if(debug && sum2 > MAX_DEBUG_LINE){break;}println( " " +serie.getName() );get = new HttpGet( serie.getUrl() );try{httpResponse = httpclient.execute(get);String htmlstring = EntityUtils.toString(httpResponse.getEntity());Document html = Jsoup.parse(htmlstring);Elements imagesElements = html.select(".row .column .uibox .uibox-con ul li>a"); //图片列表int sum3 = 0;for(Element em : imagesElements){sum3 ++ ;if(debug && sum3 > MAX_DEBUG_LINE){break;}if(em!=null){String href = this.BASE_URL + em.attr("href");get = new HttpGet( href );httpResponse = httpclient.execute(get);String htmlstring2 = EntityUtils.toString(httpResponse.getEntity());Document html2 = Jsoup.parse(htmlstring2);Element img = html2.getElementById("img");if(img!=null){CarSerieImage im = new CarSerieImage(em.attr("title"), img.attr("src"));//serie.add(im);print( " " +im.getTitle()+" img = "+im.getSrc() );downloadPhotos(im.getSrc(), fcarserie.getAbsolutePath(), im.getTitle()+"_"+sum3+".jpg");println("");}}}}catch(Exception e){e.printStackTrace();}}}get.releaseConnection();}public String getDIR_ROOT() {return DIR_ROOT;}public void setDIR_ROOT(String dIR_ROOT) {DIR_ROOT = dIR_ROOT;}public boolean 
isbDownloadImage() {return bDownloadImage;}public void setbDownloadImage(boolean bDownloadImage) {this.bDownloadImage = bDownloadImage;}public boolean isbGetModelDetail() {return bGetModelDetail;}public void setbGetModelDetail(boolean bGetModelDetail) {this.bGetModelDetail = bGetModelDetail;}
}
本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!
