简介:

最近写了一个小工具,用来抓取内涵段子、糗事百科等各种笑话网站的段子和图片,最后保存文本,并发布在微信公众号上。使用谷歌的 V8 做了一个脚本引擎,用 C++ 实现了笑话的统计和发布功能,用 JS 实现了网页爬取分析的功能。这样 C++ 调用 V8 引擎,加载 JS 脚本,就会爬取一系列的内容。

以下是网页爬取分析的内容,当然js的实现只是思路,用其他语言也是一样能实现。抓取的内容有:文章、图片地址、点赞数。

内涵段子网页抓取分析代码:

// NeiHanDuanzi scraper: script-level settings and shared result object.
// Site: http://neihanshequ.com/

// Pages requested by getJoyContextList: the text feed and the picture feed.
var webUrl = 'http://neihanshequ.com/';
var imageUrl = 'http://neihanshequ.com/pic/';

// NOTE(review): these two look like a page range, but nothing in this
// script reads them - confirm whether the host uses them.
var index = 1;
var endIndex = 5;

// Result accumulator: getJoyFromOnePage appends to `items`, and
// getJoyContextList sets `success` and returns the object as JSON.
var retVal = { success: false, items: [] };

// Scans one NeiHanDuanzi HTML page and appends one record per joke to the
// script-level `retVal.items` array.
//
// The page is consumed front to back: every marker that is found cuts
// `htmlData` down to the text that follows it, so fields are always looked
// up after the previously matched one.
//
// htmlData      - page HTML as a string.
// requestParams - accepted for call compatibility; not used here.
// Returns nothing; the output is the records pushed to `retVal.items`.
function getJoyFromOnePage(htmlData, requestParams)
{
    // Finds `marker` in `html` and reads the text between it and the next
    // `terminator`. Returns null when the marker is absent, otherwise
    // { value, rest } where `rest` starts right after the marker.
    // A missing terminator yields an empty value.
    function takeField(html, marker, terminator)
    {
        var markerPos = html.indexOf(marker);
        // indexOf reports "not found" as -1; offset 0 is a valid match.
        if( markerPos < 0 )
        {
            return null;
        }

        var rest = html.substring(markerPos + marker.length);
        return { value: rest.substring(0, rest.indexOf(terminator)), rest: rest };
    }

    while( true )
    {
        var result =
        {
            webname: 'NeiHanDuanzi',
            webid: '',
            type: '',
            context: '',
            pic_url: '',
            read_count: '',
            publish_time: '',
            best_comment: ''
        };
        var field;

        // check gif: when the next "is_gif" flag is 1, restart the scan from
        // the text after that flag.
        field = takeField(htmlData, '"is_gif":"', '"');
        if( field )
        {
            htmlData = field.rest;
            // Numeric comparison of the scraped text, same as `gif == 1`.
            if( Number(field.value) === 1 )
            {
                alert('NeiHanDuanzi:url is a gif:' + result.pic_url);
                continue;
            }
        }

        // webid: no further id means the page is exhausted. Stop here so
        // that no empty record is appended for the end of the page.
        field = takeField(htmlData, 'data-group-id="', '"');
        if( !field )
        {
            alert('NeiHanDuanzi:webid not find, page end.');
            break;
        }
        htmlData = field.rest;
        result.webid = field.value;
        alert('NeiHanDuanzi:webid-' + result.webid);

        // read_count (the "digg" counter)
        field = takeField(htmlData, '<span class="digg">', '</span>');
        if( field )
        {
            htmlData = field.rest;
            result.read_count = field.value;
        }

        // context (the joke text)
        field = takeField(htmlData, 'data-text="', '"');
        if( field )
        {
            htmlData = field.rest;
            result.context = field.value;
        }

        // pic_url
        field = takeField(htmlData, 'data-pic="', '"');
        if( field )
        {
            htmlData = field.rest;
            result.pic_url = field.value;
        }

        retVal.items.push(result);
    }
}

// Downloads the NeiHanDuanzi text page (`webUrl`) and picture page
// (`imageUrl`), feeds each response body to getJoyFromOnePage, and returns
// the script-level `retVal` serialized as JSON.
//
// url              - not used; the addresses come from `webUrl`/`imageUrl`.
// parametersString - object-literal text; evaluated and forwarded to the
//                    host as `scriptParamaters` (key spelling kept as is).
// Returns a JSON string of `retVal`. `success` is set to true only when
// both requests answered with status 200; on the first failure the
// function returns immediately with whatever `retVal` holds at that point.
function getJoyContextList( url, parametersString )
{
    // NOTE(review): eval() executes its input as code. It is kept here and
    // below because the exact format produced by the host is not visible
    // from this script; if both are strict JSON, switch to JSON.parse.
    var parameters = eval("(" + parametersString + ")");

    var requestParams =
    {
        method: 'GET',
        version: 'HTTP/1.1',
        headers:
        {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            // Hard-coded cookie string. The trailing utmctr value is
            // percent-encoded; the "%" signs had been replaced by
            // "{<64 hex chars>}" tokens and are restored here.
            'Cookie': 'uuid="w:0ef44d961a6d43c99dd81ecb51596731"; sessionid=57f633c63c5de5d0bc03cddb0c6ee166; tt_webid=5286193655; __utmt=1; csrftoken=d760789fbe1fc31edae4ac6c11c5a700; Hm_lvt_773f1a5aa45c642cf87eef671e4d3f6a=1438825221,1438939411,1440988068,1440996740; Hm_lpvt_773f1a5aa45c642cf87eef671e4d3f6a=1440996782; __utma=101886750.2017161997.1438825217.1440995644.1440996740.6; __utmb=101886750.5.10.1440996740; __utmc=101886750; __utmz=101886750.1440996740.6.4.utmcsr=haosou.com|utmccn=(organic)|utmcmd=organic|utmctr=%E5%86%85%E6%B6%B5%E7%A4%BE%E5%8C%BA',
            'Host': 'neihanshequ.com',
            'Pragma': 'no-cache',
            'Referer': 'http://neihanshequ.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
        },
        scriptParamaters: parameters
    };

    // Both pages go through the same request/parse steps; `label` is only
    // used in the failure message.
    var targets =
    [
        { label: 'webUrl', address: webUrl },
        { label: 'imageUrl', address: imageUrl }
    ];

    for( var i = 0; i < targets.length; i++ )
    {
        var target = targets[i];
        var httpRspString = syncHttpRequest(target.address, JSON.stringify(requestParams));
        var httpRsp = eval("(" + httpRspString + ")");

        // Loose comparison on purpose: accepts 200 as number or as text.
        if( !httpRsp || httpRsp.statusCode != 200 )
        {
            alert('NeiHanDuanzi: Request ' + target.label + '(' + target.address + ') failed return code:' + (httpRsp ? httpRsp.statusCode : 'undefined'));
            return JSON.stringify(retVal);
        }

        getJoyFromOnePage(httpRsp.data, requestParams);
    }

    retVal.success = true;
    return JSON.stringify(retVal);
}
糗事百科网页抓取分析代码:

// QiuShiBaiKe scraper: script-level settings and shared result object.
// Page address pattern: http://www.qiushibaike.com/hot/page/{index}

// Base address; getJoyContextList appends the page number to it.
var webUrl = 'http://www.qiushibaike.com/hot/page/';

// First and last page number requested by getJoyContextList.
var index = 1;
var endIndex = 5;

// Result accumulator: getJoyFromOnePage appends to `items`, and
// getJoyContextList sets `success` and returns the object as JSON.
var retVal = { success: false, items: [] };

// Scans one QiuShiBaiKe HTML page and appends one record per joke to the
// script-level `retVal.items` array.
//
// The page is consumed front to back: every marker that is found cuts
// `htmlData` down to the text that follows it, so fields are always looked
// up after the previously matched one.
//
// htmlData - page HTML as a string.
// Returns nothing; the output is the records pushed to `retVal.items`.
function getJoyFromOnePage(htmlData)
{
    // Finds `marker` in `html` and reads the text between it and the next
    // `terminator`. Returns null when the marker is absent, otherwise
    // { value, rest } where `rest` starts right after the marker.
    // A missing terminator yields an empty value.
    function takeField(html, marker, terminator)
    {
        var markerPos = html.indexOf(marker);
        // indexOf reports "not found" as -1; offset 0 is a valid match.
        if( markerPos < 0 )
        {
            return null;
        }

        var rest = html.substring(markerPos + marker.length);
        return { value: rest.substring(0, rest.indexOf(terminator)), rest: rest };
    }

    while( true )
    {
        var result =
        {
            webname: 'QiuShiBaiKe',
            webid: '',
            type: '',
            context: '',
            pic_url: '',
            read_count: '',
            publish_time: '',
            best_comment: ''
        };
        var field;

        // webid: no further id means the page is exhausted. Stop here so
        // that no empty record is appended for the end of the page.
        field = takeField(htmlData, 'qiushi_tag_', "'>");
        if( !field )
        {
            alert('QiuShiBaiKe:webid not find, page end.');
            break;
        }
        htmlData = field.rest;
        result.webid = field.value;
        alert('QiuShiBaiKe:webid-' + result.webid);

        // context: the text up to the next "<!" sequence.
        field = takeField(htmlData, '<div class="content">', '<!');
        if( field )
        {
            htmlData = field.rest;
            result.context = field.value;
        }

        // pic_url: only looked up after the link to this joke's own article.
        var articleLink = '<a href="/article/' + result.webid + '" target="_blank">';
        var linkPos = htmlData.indexOf(articleLink);
        if( linkPos >= 0 )
        {
            htmlData = htmlData.substring(linkPos + articleLink.length);

            // The <img> may follow the link directly (offset 0).
            field = takeField(htmlData, '<img src="', '" alt="');
            if( field )
            {
                htmlData = field.rest;
                result.pic_url = field.value;
            }
        }

        // read_count (the vote counter)
        field = takeField(htmlData, '<span class="stats-vote"><i class="number">', '</i>');
        if( field )
        {
            htmlData = field.rest;
            result.read_count = field.value;
        }

        retVal.items.push(result);
    }
}

// Downloads QiuShiBaiKe hot pages `index` through `endIndex` (address is
// `webUrl` + page number), feeds each response body to getJoyFromOnePage,
// and returns the script-level `retVal` serialized as JSON.
//
// url              - not used; the addresses are built from `webUrl`.
// parametersString - object-literal text; evaluated and forwarded to the
//                    host as `scriptParamaters` (key spelling kept as is).
// Returns a JSON string of `retVal`. `success` is set to true only when
// every page answered with status 200; on the first failure the function
// returns immediately with whatever `retVal` holds at that point.
function getJoyContextList( url, parametersString )
{
    // NOTE(review): eval() executes its input as code. It is kept here and
    // below because the exact format produced by the host is not visible
    // from this script; if both are strict JSON, switch to JSON.parse.
    var parameters = eval("(" + parametersString + ")");

    var requestParams =
    {
        method: 'GET',
        version: 'HTTP/1.1',
        headers:
        {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            'Host': 'www.qiushibaike.com',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        },
        scriptParamaters: parameters
    };

    // Iterate with a local counter so the script-level `index` keeps its
    // value and a later call walks the same page range again.
    for( var page = index; page <= endIndex; page++ )
    {
        var trueUrl = webUrl + page;
        var httpRspString = syncHttpRequest(trueUrl, JSON.stringify(requestParams));
        var httpRsp = eval("(" + httpRspString + ")");

        // Loose comparison on purpose: accepts 200 as number or as text.
        if( !httpRsp || httpRsp.statusCode != 200 )
        {
            alert('QiuShiBaiKe: Request trueUrl(' + trueUrl + ') failed return code:' + (httpRsp ? httpRsp.statusCode : 'undefined'));
            return JSON.stringify(retVal);
        }

        getJoyFromOnePage(httpRsp.data);
    }

    retVal.success = true;
    return JSON.stringify(retVal);
}

标签: none

添加新评论