我已经在Node.js中编写了一个小脚本来抓取网页并获取一些链接。抓取部分由Cheerio完成。我的代码如下(为节省篇幅已简化):
// Downloads the 'mtfCompanies' page and collects the text and href of every
// matched anchor into `mutuals.companies`.
var request = require('request');
var cheerio = require('cheerio');
var base_url = 'http://www.naftemporiki.gr/finance/';
// Result object: creation timestamp plus the list of scraped companies.
var mutuals = {};
mutuals.date = new Date();
mutuals.companies = [];
var company = {};
// request() only starts the HTTP call; the function passed to it runs later,
// once the response (or an error) is available.
request(base_url + 'mtfCompanies', function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
// One entry per matched anchor: its text becomes the name, its href the link.
$('.blueRow.texttd.name a').each(function (i, element) {
var a = $(this);
company = {};
company.name = a.text();
company.link = a.attr('href');
mutuals.companies.push(company);
});
}
// Logging here sees the populated list, because it runs inside the callback.
//console.log(mutuals); // 1st place
});
// Runs right after request() has been issued, before the callback above has
// pushed anything, so `companies` is still empty at this point.
console.log(mutuals); // 2nd place
有趣的部分来了:当我在“request”回调块内的“第一个位置”输出这个JSON文档时,结果是正常的。示例如下:
{ date: Wed Nov 26 2014 10:35:09 GMT+0200 (EET),
companies:
[ { name: ' J.P. MORGAN ASSET MANAGEMENT',
link: 'mtfCompany?id=J.P.+MORGAN+ASSET+MANAGEMENT' },
{ name: ' BNP PARIBAS INVESTMENT PARTNERS',
link: 'mtfCompany?id=BNP+PARIBAS+INVESTMENT+PARTNERS' },
{ name: ' PICTET', link: 'mtfCompany?id=PICTET' },
{ name: ' ALLIANZ ΑΕΔΑΚ',
link: 'mtfCompany?id=ALLIANZ+%ce%91%ce%95%ce%94%ce%91%ce%9a' },
{ name: ' ALLIANZ ΑΕΔΑΚ (ΑΝΤΙΠΡ.)',
link: 'mtfCompany?id=ALLIANZ+%ce%91%ce%95%ce%94%ce%91%ce%9a+(%ce%91%ce%9d%ce%a4%ce%99%ce%a0%ce%a1.)' },
{ name: ' ALLIANZ ΕΛΛΑΣ Α.Ε.',
link: 'mtfCompany?id=ALLIANZ+%ce%95%ce%9b%ce%9b%ce%91%ce%a3+%ce%91.%ce%95.' }]}
而当我在所有代码块之外、脚本末尾的“第二个位置”输出这个JSON文档时,得到的却是:
{ date: Wed Nov 26 2014 10:35:09 GMT+0200 (EET), companies: [] }
看起来JSON文档中的'companies'数组被清空了。我怀疑 'mutuals.companies = [];' 这一行由于某种原因被再次执行了。
有人能帮忙吗?
更新1:
按照建议,我把代码改成了使用“async.series”。这是更新后的版本:
// Second attempt: the scrape and the print are wrapped as two async.series
// steps, which run one after another as each step calls its `callback`.
var request = require('request'),
async = require('async'),
cheerio = require('cheerio');
var base_url = 'http://www.naftemporiki.gr/finance/';
// Result object: creation timestamp plus the list of scraped companies.
var mutuals = {};
mutuals.date = new Date();
mutuals.companies = [];
var company = {};
async.series([
// Step 1: start the download and fill `mutuals.companies` from the response.
function(callback) {
request(base_url + 'mtfCompanies', function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
// One entry per matched anchor: its text becomes the name, its href the link.
$('.blueRow.texttd.name a').each(function (i, element) {
var a = $(this);
company = {};
company.name = a.text();
company.link = a.attr('href');
mutuals.companies.push(company);
});
}
});
// NOTE: this sits outside the request callback, so the step is reported as
// finished as soon as the request has been issued, not when it completes.
callback(null, 'one');
},
// Step 2: print the collected result.
function (callback) {
console.log(mutuals);
callback(null, 'two');
}
]);
仍然无法正常工作,输出的JSON仍然是这样的:
{ date: Wed Nov 26 2014 10:35:09 GMT+0200 (EET), companies: [] }
您的“第二个位置”是在请求完成之前打印变量的。
您的“第一个位置”之所以有效,是因为它位于请求的回调中:发出请求,获取数据,然后调用回调,于是打印成功。
这就是异步代码的工作方式:它是非阻塞的。因此,当您发出请求时,Node会保存回调函数,等请求有了结果之后再用该结果执行回调中的代码。
更新1:
您更新后的代码问题大致相同。在 series 的第一个函数中,callback 在请求完成之前就被调用了。
如果把 callback 的调用移到传给 request 的回调函数里,它就会在请求完成之后才被调用。
function(callback) {
request(base_url + 'mtfCompanies', function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
$('.blueRow.texttd.name a').each(function (i, element) {
var a = $(this);
company = {};
company.name = a.text();
company.link = a.attr('href');
mutuals.companies.push(company);
});
callback(null, 'one');
}
});
},
建议1
在node.js中使用回调进行开发,很容易产生很深的嵌套结构。不要让if语句使嵌套变得更糟,应使用提前返回(early return)来代替更深的嵌套。例如:
function(callback) {
request(base_url + 'mtfCompanies', function (error, response, html) {
if(error) return callback(error);
if(response.statusCode !== 200) return callback('status code not 200');
var $ = cheerio.load(html);
$('.blueRow.texttd.name a').each(function (i, element) {
var a = $(this);
company = {};
company.name = a.text();
company.link = a.attr('href');
mutuals.companies.push(company);
});
callback(null, 'one');
});
},
建议2
使用async时,可以通过使用命名函数来简化代码。例如:
var request = require('request'),
async = require('async'),
cheerio = require('cheerio');
// Root URL that the page path is appended to.
var base_url = 'http://www.naftemporiki.gr/finance/';
// Scrape result: creation timestamp plus the list of collected companies.
var mutuals = {
  date: new Date(),
  companies: []
};
// Holder for the company entry currently being built.
var company = {};
/**
 * Series step: downloads the companies page and appends one
 * `{ name, link }` entry per matched anchor to `mutuals.companies`.
 *
 * @param {function} callback - Node-style callback. Receives the request
 *   error, or an Error carrying `statusCode` when the response is not 200,
 *   or `(null, 'one')` once every matched link has been collected.
 */
function getPage(callback) {
  request(base_url + 'mtfCompanies', function (error, response, html) {
    if (error) return callback(error);
    if (response.statusCode !== 200) {
      // Report a real Error (with a stack trace) rather than a bare string.
      var statusError = new Error('status code not 200');
      statusError.statusCode = response.statusCode;
      return callback(statusError);
    }

    var $ = cheerio.load(html);
    $('.blueRow.texttd.name a').each(function (i, element) {
      var a = $(this);
      // Build each entry locally instead of going through the shared
      // module-level `company` variable.
      mutuals.companies.push({ name: a.text(), link: a.attr('href') });
    });
    callback(null, 'one');
  });
}
/**
 * Series step: logs the collected `mutuals` object, then reports completion.
 *
 * @param {function} callback - Node-style callback, invoked as `(null, 'two')`.
 */
function printMutuals(callback) {
  var stepResult = 'two';
  console.log(mutuals);
  callback(null, stepResult);
}
// Run the steps in order. The final callback is invoked with the first error a
// step reports (the remaining steps are then skipped), so failures are logged
// instead of being silently dropped.
async.series([
  getPage,
  printMutuals
], function (err) {
  if (err) {
    console.error(err);
  }
});
本文收集自互联网,转载请注明来源。
如有侵权,请联系[email protected] 删除。
我来说两句