朱建国 发布的文章

简介:

最近写了一个小工具,用来抓取内涵段子、糗事百科等各种笑话网站的段子和图片,最后保存文本,并发布在微信公众号上。使用谷歌的 v8 做了一个脚本引擎,使用 c++ 实现了笑话的统计和发布功能,用js实现了网页爬取分析的功能。这样 c++ 调用 v8 引擎,加载 js 脚本,就会爬取一系列的内容。

以下是网页爬取分析的内容,当然js的实现只是思路,用其他语言也是一样能实现。抓取的内容有:文章、图片地址、点赞数。

内涵段子网页抓取分析代码:

// Scraper settings for the Neihan Duanzi joke site.
// Site: http://neihanshequ.com/
var webUrl = 'http://neihanshequ.com/';
var imageUrl = 'http://neihanshequ.com/pic/';

// Page range settings.
// NOTE(review): the functions of this script do not appear to read these — confirm before removing.
var index = 1;
var endIndex = 5;

// Shared scrape result: `items` collects the parsed entries and `success`
// starts out false.
var retVal = { success: false, items: [] };

/**
 * Parse every joke entry found in one Neihan Duanzi listing page and append
 * each parsed entry to the script-level `retVal.items` array.
 *
 * The page text is consumed front to back: every field lookup cuts
 * `htmlData` down to the text that follows the matched keyword, so fields
 * are expected in the order is_gif, data-group-id, digg, data-text, data-pic.
 * Parsing stops at the first entry for which no `data-group-id` is found;
 * nothing is pushed for that incomplete entry.
 *
 * @param {string} htmlData - Raw HTML of a listing page.
 * @param {Object} requestParams - Request descriptor of the page; not read here.
 * @returns {undefined}
 */
function getJoyFromOnePage(htmlData, requestParams)
{
    // A response without a text body has nothing to parse.
    if (typeof htmlData !== 'string')
    {
        return;
    }

    // Returns the text following the first occurrence of keyWords, or null
    // when keyWords does not occur (a match at offset 0 counts as found).
    function advancePast(html, keyWords)
    {
        var startIndex = html.indexOf(keyWords);
        return startIndex === -1 ? null : html.substring(startIndex + keyWords.length);
    }

    // Returns the leading part of html up to (not including) terminator.
    function readUntil(html, terminator)
    {
        return html.substring(0, html.indexOf(terminator));
    }

    while (true)
    {
        var result =
        {
            webname: 'NeiHanDuanzi',
            webid: '',
            type: '',
            context: '',
            pic_url: '',
            read_count: '',
            publish_time: '',
            best_comment: ''
        };

        // Entries flagged is_gif = 1 are skipped.
        // NOTE(review): the lookup scans the whole remaining page, so it can
        // jump past entries that carry no is_gif field — confirm page layout.
        var rest = advancePast(htmlData, '"is_gif":"');
        if (rest !== null)
        {
            htmlData = rest;
            if (Number(readUntil(htmlData, '"')) === 1)
            {
                alert('NeiHanDuanzi:url is a gif:' + result.pic_url);
                continue;
            }
        }

        // webid: without it there is no further entry, so stop here instead
        // of pushing an empty record.
        rest = advancePast(htmlData, 'data-group-id="');
        if (rest === null)
        {
            alert('NeiHanDuanzi:webid not find, page end.');
            break;
        }
        htmlData = rest;
        result.webid = readUntil(htmlData, '"');
        alert('NeiHanDuanzi:webid-' + result.webid);

        // read_count
        rest = advancePast(htmlData, '<span class="digg">');
        if (rest !== null)
        {
            htmlData = rest;
            result.read_count = readUntil(htmlData, '</span>');
        }

        // context
        rest = advancePast(htmlData, 'data-text="');
        if (rest !== null)
        {
            htmlData = rest;
            result.context = readUntil(htmlData, '"');
        }

        // pic_url
        rest = advancePast(htmlData, 'data-pic="');
        if (rest !== null)
        {
            htmlData = rest;
            result.pic_url = readUntil(htmlData, '"');
        }

        retVal.items.push(result);
    }
}

/**
 * Fetch the Neihan Duanzi text page (`webUrl`) and picture page (`imageUrl`),
 * run getJoyFromOnePage() on each response body and return the shared
 * `retVal` object serialized as JSON.
 *
 * `retVal.success` is set to true only when both requests answered with
 * status 200; on the first failing request the current `retVal` is returned
 * as is.
 *
 * @param {string} url - Not used; the script-level URLs are requested instead.
 * @param {string} parametersString - Object-literal text supplied by the
 *     caller; forwarded to the host as `scriptParamaters`.
 * @returns {string} JSON text of `retVal`.
 */
function getJoyContextList( url, parametersString )
{
    // NOTE(review): eval() executes whatever text the caller / host hands in.
    // If both inputs are guaranteed to be strict JSON, JSON.parse would be the
    // safer choice — confirm the host's output format before switching.
    var parameters = eval("(" + parametersString + ")");
    var requestParams =
    {
        method: 'GET',
        version: 'HTTP/1.1',
        headers: {},
        scriptParamaters: parameters
    };

    var headers = requestParams.headers;
    headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8';
    headers['Accept-Encoding'] = 'gzip, deflate, sdch';
    headers['Accept-Language'] = 'zh-CN,zh;q=0.8,en;q=0.6';
    headers['Cache-Control'] = 'no-cache';
    headers['Connection'] = 'keep-alive';
    // The utmctr value is percent-encoded; the '%' signs had been replaced by
    // a "{b75a…}" placeholder, which corrupted the cookie.
    // NOTE(review): this is a captured browser session cookie hard-coded in
    // source — consider supplying it through the script parameters instead.
    headers['Cookie'] = 'uuid="w:0ef44d961a6d43c99dd81ecb51596731"; sessionid=57f633c63c5de5d0bc03cddb0c6ee166; tt_webid=5286193655; __utmt=1; csrftoken=d760789fbe1fc31edae4ac6c11c5a700; Hm_lvt_773f1a5aa45c642cf87eef671e4d3f6a=1438825221,1438939411,1440988068,1440996740; Hm_lpvt_773f1a5aa45c642cf87eef671e4d3f6a=1440996782; __utma=101886750.2017161997.1438825217.1440995644.1440996740.6; __utmb=101886750.5.10.1440996740; __utmc=101886750; __utmz=101886750.1440996740.6.4.utmcsr=haosou.com|utmccn=(organic)|utmcmd=organic|utmctr=%E5%86%85%E6%B6%B5%E7%A4%BE%E5%8C%BA';
    headers['Host'] = 'neihanshequ.com';
    headers['Pragma'] = 'no-cache';
    headers['Referer'] = 'http://neihanshequ.com/';
    headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11';

    // Text page first, then the picture page; `label` only feeds the log text.
    var targets =
    [
        { label: 'webUrl', address: webUrl },
        { label: 'imageUrl', address: imageUrl }
    ];

    for (var t = 0; t < targets.length; t++)
    {
        var target = targets[t];
        var httpRspString = syncHttpRequest(target.address, JSON.stringify(requestParams));
        var httpRsp = eval("(" + httpRspString + ")");
        if( !httpRsp || httpRsp.statusCode != 200 )
        {
            alert('NeiHanDuanzi: Request ' + target.label + '(' + target.address + ') failed return code:' + (httpRsp ? httpRsp.statusCode : 'undefined'));
            return JSON.stringify(retVal);
        }

        getJoyFromOnePage(httpRsp.data, requestParams);
    }

    retVal.success = true;
    return JSON.stringify(retVal);
}
糗事百科网页抓取分析代码

// Scraper settings for the Qiushibaike joke site.
// Page URL pattern: http://www.qiushibaike.com/hot/page/{index}
var webUrl = 'http://www.qiushibaike.com/hot/page/';

// First and last page number of the "hot" listing to request.
var index = 1;
var endIndex = 5;

// Shared scrape result: `items` collects the parsed entries and `success`
// starts out false.
var retVal = { success: false, items: [] };

/**
 * Parse every joke entry found in one Qiushibaike listing page and append
 * each parsed entry to the script-level `retVal.items` array.
 *
 * The page text is consumed front to back: every field lookup cuts
 * `htmlData` down to the text that follows the matched keyword, so fields
 * are expected in the order qiushi_tag_ id, content, picture link, vote count.
 * Parsing stops at the first entry for which no `qiushi_tag_` id is found;
 * nothing is pushed for that incomplete entry.
 *
 * @param {string} htmlData - Raw HTML of a listing page.
 * @returns {undefined}
 */
function getJoyFromOnePage(htmlData)
{
    // A response without a text body has nothing to parse.
    if (typeof htmlData !== 'string')
    {
        return;
    }

    // Returns the text following the first occurrence of keyWords, or null
    // when keyWords does not occur (a match at offset 0 counts as found).
    function advancePast(html, keyWords)
    {
        var startIndex = html.indexOf(keyWords);
        return startIndex === -1 ? null : html.substring(startIndex + keyWords.length);
    }

    // Returns the leading part of html up to (not including) terminator.
    function readUntil(html, terminator)
    {
        return html.substring(0, html.indexOf(terminator));
    }

    while (true)
    {
        var result =
        {
            webname: 'QiuShiBaiKe',
            webid: '',
            type: '',
            context: '',
            pic_url: '',
            read_count: '',
            publish_time: '',
            best_comment: ''
        };

        // webid: without it there is no further entry, so stop here instead
        // of pushing an empty record.
        var rest = advancePast(htmlData, 'qiushi_tag_');
        if (rest === null)
        {
            alert('QiuShiBaiKe:webid not find, page end.');
            break;
        }
        htmlData = rest;
        result.webid = readUntil(htmlData, "'>");
        alert('QiuShiBaiKe:webid-' + result.webid);

        // context
        rest = advancePast(htmlData, '<div class="content">');
        if (rest !== null)
        {
            htmlData = rest;
            result.context = readUntil(htmlData, "<!");
        }

        // pic_url: the image sits inside the link to this entry's article page.
        rest = advancePast(htmlData, '<a href="/article/' + result.webid + '" target="_blank">');
        if (rest !== null)
        {
            htmlData = rest;
            rest = advancePast(htmlData, '<img src="');
            if (rest !== null)
            {
                htmlData = rest;
                result.pic_url = readUntil(htmlData, '" alt="');
            }
        }

        // read_count
        rest = advancePast(htmlData, '<span class="stats-vote"><i class="number">');
        if (rest !== null)
        {
            htmlData = rest;
            result.read_count = readUntil(htmlData, "</i>");
        }

        retVal.items.push(result);
    }
}

/**
 * Fetch the Qiushibaike "hot" pages numbered `index` through `endIndex`,
 * run getJoyFromOnePage() on each response body and return the shared
 * `retVal` object serialized as JSON.
 *
 * `retVal.success` is set to true only when every page answered with status
 * 200; on the first failing request the current `retVal` is returned as is.
 *
 * @param {string} url - Not used; pages are built from the script-level `webUrl`.
 * @param {string} parametersString - Object-literal text supplied by the
 *     caller; forwarded to the host as `scriptParamaters`.
 * @returns {string} JSON text of `retVal`.
 */
function getJoyContextList( url, parametersString )
{
    // NOTE(review): eval() executes whatever text the caller / host hands in.
    // If both inputs are guaranteed to be strict JSON, JSON.parse would be the
    // safer choice — confirm the host's output format before switching.
    var parameters = eval("(" + parametersString + ")");
    var requestParams =
    {
        method: 'GET',
        version: 'HTTP/1.1',
        headers: {},
        scriptParamaters: parameters
    };
    requestParams.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11';
    requestParams.headers['Host'] = 'www.qiushibaike.com';
    requestParams.headers['Connection'] = 'keep-alive';
    requestParams.headers['Accept-Encoding'] = 'gzip, deflate, sdch';
    requestParams.headers['Accept-Language'] = 'zh-CN,zh;q=0.8,en;q=0.6';
    requestParams.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8';

    // Walk the page range with a local counter: advancing the script-level
    // `index` itself would leave it past `endIndex`, so any later call would
    // fetch nothing.
    for (var page = index; page <= endIndex; page++)
    {
        var trueUrl = webUrl + page;
        var httpRspString = syncHttpRequest(trueUrl, JSON.stringify(requestParams));
        var httpRsp = eval("(" + httpRspString + ")");
        if( !httpRsp || httpRsp.statusCode != 200 )
        {
            alert('QiuShiBaiKe: Request trueUrl(' + trueUrl + ') failed return code:' + (httpRsp ? httpRsp.statusCode : 'undefined'));
            return JSON.stringify(retVal);
        }

        getJoyFromOnePage(httpRsp.data);
    }

    retVal.success = true;
    return JSON.stringify(retVal);
}

功能描述:

主账号能输入物料信息(入库),子账号能输出物料信息(出库)。

显示入库和出库的记录。

显示当前剩余物料信息,并导出至excel

开发计划:

1、搭建JAVA SSH开发环境,完成简单的数据库注册 DEMO,并发布。–ok

2、设计数据库表。 ——–ok

3、设计功能及WEB操作界面 ——–ok

4、开发各个功能界面

a、站点管理修改 ———ok

b、物料分类管理 ———ok

c、代码整理 ——ok

d、用户与骑士站点对应关系 ——-ok

e、库存管理 ———-ok

f、BD 申请 ———-ok

g、站长批阅 ———-ok

h、管理员批阅 ———-ok

5、打包发布 ——-ok http://192.157.227.154:8080/Stock

6、测试程序。

a、逻辑优化

b、代码优化

名称:

双色球中奖邮件通知

功能:

1、每个开奖日后第二天以邮件方式通知是否中奖。

思路:

1、使用Python,跨平台,三方库功能强大

2、从网页上爬取当前的开奖号码 http://cp.360.cn/kj/ssq.html?agent=700007

3、与自己填写的号码检测是否匹配中奖

4、判断后发送邮件通知

5、每周2、4、7开奖,则每周3、5、1凌晨1点(可修改为上午十点最好)判断并发送。

# -*- coding: UTF-8 -*-

import smtplib
import urllib2
import string
import datetime
import time

from email.mime.text import MIMEText

from email.MIMEText import MIMEText
from email.Header import Header

# My ticket: six red numbers and one blue number
my_code_six = [1, 4, 8, 15, 17, 28]
my_code_one = 3

# Page the draw results are read from
web_url = "http://cp.360.cn/kj/ssq.html?agent=700007"

# Latest draw result, filled in by check_lottery()
lottery_date = ""
lottery_code_six = []
lottery_code_one = 0

# Number of matching red / blue numbers, filled in by check_lottery()
winning_red_count = 0
winning_blue_count = 0

mailto_list = ";"         # recipient(s) handed to send_mail() as to_list
mail_host = ""            # SMTP server
mail_user = ""            # account name
mail_pass = ""            # password
mail_postfix = "163.com"  # domain part of the sender address

def send_mail(to_list, sub, content):
    """Send a plain-text mail through the SMTP account in the module settings.

    Args:
        to_list: recipient addresses, either a sequence of addresses or one
            string with addresses separated by ';'.
        sub: subject line.
        content: message body.

    Returns:
        True when the message was handed to the server, False when there is
        no recipient or any step of the SMTP conversation failed (the error
        text is printed).
    """
    # A ';'-separated string must be split: smtplib treats a plain string as
    # one single recipient address.
    if hasattr(to_list, "split"):
        recipients = [addr.strip() for addr in to_list.split(";") if addr.strip()]
    else:
        recipients = [addr for addr in to_list if addr]
    if not recipients:
        print("send_mail: no recipient address given")
        return False

    me = "中大奖" + "<" + mail_user + "@" + mail_postfix + ">"
    # This source file is UTF-8, so the text is labelled UTF-8 rather than
    # gb2312; the subject goes through Header so non-ASCII text is encoded.
    msg = MIMEText(content, _subtype='plain', _charset='utf-8')
    msg['Subject'] = Header(sub, 'utf-8')
    msg['From'] = me
    msg['To'] = ", ".join(recipients)

    server = smtplib.SMTP()
    try:
        server.connect(mail_host)
        server.login(mail_user, mail_pass)
        server.sendmail(me, recipients, msg.as_string())
        return True
    except Exception as e:
        print(str(e))
        return False
    finally:
        # Release the connection on failure as well as on success.
        server.close()

def check_lottery():
    """Download the latest draw and compare it with the configured ticket.

    Reads the first "option value='<date>' code='<r1 r2 r3 r4 r5 r6>+<blue>'"
    entry of the page at web_url and stores it in lottery_date,
    lottery_code_six (the red numbers, kept as the page's text) and
    lottery_code_one; the match counts go to winning_red_count and
    winning_blue_count.

    Written for Python 2 (urllib2); errors raised by the download itself are
    not caught here.

    Returns:
        1 when a mail-worthy match was found (blue number matches, or at
        least four red numbers match), otherwise 0. 0 is also returned when
        the expected entry cannot be found or parsed.
    """
    global lottery_date
    global winning_red_count
    global winning_blue_count
    global lottery_code_one

    # Start from a clean slate: this runs once per draw in a long-lived loop,
    # so numbers of an earlier draw must not be counted or reported again.
    winning_red_count = 0
    winning_blue_count = 0
    lottery_date = ""
    lottery_code_one = 0
    del lottery_code_six[:]

    response = urllib2.urlopen(web_url)
    try:
        html = response.read()
    finally:
        response.close()

    # date -- str.find() returns -1 on a miss (str.index() would raise, which
    # made the "not found" checks unreachable).
    key = "option value='"
    nPos = html.find(key)
    if nPos < 0:
        return 0
    # Only the 37 characters after the key are inspected.
    # NOTE(review): presumably sized for a 7-character date plus the code
    # attribute — confirm against the live page.
    html = html[nPos + len(key): nPos + len(key) + 37]
    print('GET NEW RESULT:   ' + html)
    nPos = html.find("'")
    if nPos <= 0:
        return 0
    draw_date = html[0:nPos]

    # codes: six red numbers separated by blanks, then '+' and the blue number
    key = "code='"
    nPos = html.find(key)
    if nPos < 0:
        return 0
    html = html[nPos + len(key):]
    nPos = html.find("'")
    if nPos <= 0:
        return 0
    reds_text, plus_sign, one_code = html[0:nPos].partition("+")
    red_codes = reds_text.split()
    if not plus_sign or len(red_codes) != 6:
        return 0
    try:
        red_numbers = [int(code) for code in red_codes]
        blue_number = int(one_code)
    except ValueError:
        return 0

    lottery_date = draw_date
    lottery_code_six.extend(red_codes)
    lottery_code_one = blue_number

    for number in red_numbers:
        if number in my_code_six:
            winning_red_count = winning_red_count + 1
    if my_code_one == lottery_code_one:
        winning_blue_count = 1
        print("ok")

    print(my_code_one)
    print(one_code)

    need_send_email = 0
    # blue number matches
    if winning_blue_count >= 1:
        need_send_email = 1

    # four or more red numbers match
    if winning_red_count >= 4:
        need_send_email = 1

    print(need_send_email)
    return need_send_email

def check_date():
    """Tell whether now is the moment to check the draw result.

    Returns:
        1 when the local time is Monday, Wednesday or Friday at 01:00
        (any second of that minute), otherwise 0.
    """
    # One clock reading for weekday, hour and minute, so the three values
    # cannot straddle a minute or day boundary.
    now = datetime.datetime.now()

    # weekday() counts from Monday = 0; shift to Monday = 1.
    date_vaule = now.weekday() + 1
    print('date_vaule:' + str(date_vaule))
    if date_vaule not in (1, 3, 5):
        return 0

    # "%H" / "%M" had been mangled into "{b75a…}H" / "{b75a…}M", which
    # strftime would echo back as non-numeric text.
    hour = now.strftime("%H")
    minute = now.strftime("%M")
    print('hour:' + hour + '. minute:' + minute)
    if int(hour) != 1 or int(minute) != 0:
        return 0

    print('date_vaule:' + str(date_vaule) + '.hour:' + hour + '.minute:' + minute)
    return 1
 

# Script entry point: poll the clock forever and, at each scheduled moment,
# check the draw and mail the outcome.
if __name__ == '__main__':
    while True:
        if check_date() <= 0:
            print('do noting')
            # Poll more often than once a minute so the one-minute window
            # accepted by check_date() cannot be skipped.
            time.sleep(40)
        else:
            print("It's time to check lottery!!")
            result = check_lottery()
            print(result)

            send_message = "您的号码:"
            for my in my_code_six:
                send_message += str(my) + ","
            send_message += ":" + str(my_code_one) + "    "

            send_message += "开奖号码:"

            for lottery in lottery_code_six:
                send_message += str(lottery) + ','
            send_message += ":" + str(lottery_code_one)

            print(send_message)

            head = ""
            if result > 0:
                head = "恭喜您中奖了,去看看吧"
            else:
                head = "下次一定能中的"
            print(head)

            if send_mail(mailto_list, head, send_message):
                print("Send email ok")
            else:
                print("Send email faile")
            # Sleep past the matched minute so the same draw is not mailed twice.
            time.sleep(70)

简介

《潜意识2》

作者:高原

出版社:重庆出版社

目录摘要

1、心灵运行的规则

2、你比想象的聪明

3、你有多少坏情绪

4、积极能量产生的原理

5、其他 VIP 未读

内容摘要

1、有什么样的观念,就会督促你采取怎样的行动,而行为又会不知不觉中影响思维形成。所以每个人应该努力去塑造自己内心世界的宁静,平和,希望。

2、对一个人而言,最富有力量的是“当下”,而不是过去或者将来。你现在做的每一件事,都决定着你未来的样子。

3、几点行动:学会坚持,适时表现自己的好恶,定一个目标,有责任心,少发牢骚,接受意见,坦诚待人,有主见,多学习多阅历,谦和。

4、其他 VIP 未读

收获

1、潜意识和心理学一样,属于每个人每时每刻都存在的状态。

2、潜意识影响行为,俗话也说,性格决定命运。而潜意识跟性格相似,类似于心理学上的一种固态。

3、改变潜意识的方式。

冥想,关注呼吸,吸气时,起至眉心;呼气时,力沉丹田。身体力量随着呼吸而起伏变化。

祈祷,祈祷神明,祈福自我。

运动,属于冥想的一种动态方式,比如跑步中关注自己的身体,随时放松身上紧张的肌肉,关注自己呼吸的节奏,关注步伐的声音。

简介

《习惯的力量》

作 者:查尔斯·都希格(Charles Duhigg)著

出版社:中信出版社

目录摘要

1、“白速得”与全民刷牙的习惯

2、意志力甚于好奇心

3、熟人社区与群体

4、弱联系的力量

内容摘要

1、用白速得广告案例来描述一个观点:广告其实也是让用户习惯的养成过程。

2、习惯养成的三个条件:暗示、激励、意志力。

a、暗示 + 激励是广告的秘诀。先暗示一种美好,在做完某个行为之后再给予激励。比如空气清新剂,暗示你使用后空气清新,在打扫屋子后喷一下,幸福感和成就感爆棚。

b、意志力,延迟满足是意志力培养的一种方式,举例,告知儿童不要吃巧克力,得知其意志力强弱,观察其数十年后的成就,得出意志力强的人更易取得大的成就。

c、意志力是会被消耗的。比如先干各种杂事,然后再去做刻苦的事就难以坚持。

d、意志力是可以锻炼的。比如在一方面意志力增强之后,其他方面的意志力也会增强。养成一个好习惯后,其他习惯更容易养成。因为已经尝到甜头了,意志力会告诉你延迟满足会得到更多好处。
收获

1、习惯养成

激励,直接、有效、最快的激励。比如论坛、QQ、游戏的各种等级,比如空间、微信点赞。激励会让人更有成就感,甚至上瘾。

事务尽量拆分,保证每次都能做完并给予反馈。比如写这篇文章后发一篇博客,让我很有成就感。

2、意志力养成

延迟满足。运动,坚持快乐运动,坚信运动快乐,运动带来的是延迟后的快感。以此提高整体意志力。