|
来源:http://bbs.51cto.com/thread-1400050-1.html
预览源代码
打印
001 | var keywords = "黄焖鸡米饭"; |
005 | domains: ["dianping.com"], |
014 | selector: "//div[contains(@class,\'shop-review-wrap\')]/div/h3/a/text()" |
022 | selector: "//div[contains(@class,\'block raw-block')]/ul/li[1]/span" |
026 | selector: "//div[@class=\'breadcrumb\']/b[1]/a/span/text()", |
030 | name: "province_name", |
031 | selector: "//div[@class=\'breadcrumb\']/b[1]/a/span/text()" |
036 | configs.onProcessHelperUrl = function(url, content, site) { |
038 | for (var i = 0; i < urls.length; i++) { |
039 | site.addUrl(urls+"/editmember"); |
043 | site.addUrl(nextPage); |
044 | var result = /\d+$/.exec(nextPage); |
046 | var data = result[0]; |
047 | var count = nextPage.length-data.length; |
048 | var lll = nextPage.substr(0, count)+(parseInt(data)+1); |
049 | site.addUrl(nextPage.substr(0, count)+(parseInt(data)+1)); |
050 | site.addUrl(nextPage.substr(0, count)+(parseInt(data)+2)); |
056 | configs.afterExtractField = function(fieldName, data, page) { |
057 | if (fieldName == "id") { |
058 | var result = /\d+$/.exec(data); |
063 | else if (fieldName == "shop_name") { |
064 | if (data.indexOf("黄焖鸡米饭") == -1) { |
068 | else if (fieldName == "create_time") { |
069 | var result = /\d{2}-\d{2}-\d{2}$/.exec(data); |
070 | data = "20"+result[0]; |
072 | else if (fieldName == "province_name" || fieldName == "region_name") { |
073 | var position = data.indexOf("县"); |
074 | if (position != -1 && position < data.length -1) { |
075 | data = data.substr(0,position+1); |
077 | position = data.indexOf("市"); |
078 | if (position != -1 && position < data.length -1) { |
079 | data = data.substr(0,position+1); |
081 | data = data.replace("餐厅",""); |
082 | if (fieldName == "province_name") { |
083 | data = getProvinceNameByRegion(data); |
089 | configs.nextScanUrl = function(url) { |
090 | var num = /\/(\d+)\//.exec(url); |
091 | if (num && num[1] < 2323) { |
100 | var crawler = new Crawler(configs); |
爬虫脚本可以在神箭手云爬虫框架上运行,对爬虫有兴趣的可以试试,欢迎来交流~
|
|