分类 默认分类 下的文章

简介:

最近写了一个小工具,用来抓取内涵段子、糗事百科等各种笑话网站的段子和图片,最后保存文本,并发布在微信公众号上。使用谷歌的 V8 做了一个脚本引擎,用 C++ 实现了笑话的统计和发布功能,用 JS 实现了网页爬取分析的功能。这样 C++ 调用 V8 引擎,加载 JS 脚本,就会爬取一系列的内容。

以下是网页爬取分析的内容,当然js的实现只是思路,用其他语言也是一样能实现。抓取的内容有:文章、图片地址、点赞数。

内涵段子网页抓取分析代码:

// NeiHanDuanzi scraper settings.
// Site: http://neihanshequ.com/
var webUrl = 'http://neihanshequ.com/';

// Picture channel of the same site.
var imageUrl = webUrl + 'pic/';

// Paging bounds; declared here but not read by this script's functions.
var index = 1;
var endIndex = 5;

// Result object shared by the functions of this script and returned as JSON.
var retVal = { success: false, items: [] };

/**
 * Extracts every joke found in one NeiHanDuanzi HTML page and appends one
 * record per joke to the shared `retVal.items` array.
 *
 * The page is consumed strictly left to right: every marker lookup discards
 * the text up to and including the matched marker, so the fields are searched
 * in the order webid, read_count, context, pic_url.
 *
 * @param {string} htmlData - Raw HTML of the page; a non-string is treated as an empty page.
 * @param {Object} requestParams - Options of the request that produced the page; not used here.
 * @returns {undefined} Results are delivered through `retVal.items`.
 */
function getJoyFromOnePage(htmlData, requestParams)
{
    // Unconsumed tail of the page.
    var remaining = (typeof htmlData === 'string') ? htmlData : '';

    // Returns the text between `marker` and the next `terminator` and drops
    // everything before that text from `remaining`; returns null when the
    // marker is absent. A marker at offset 0 counts as found.
    function takeField(marker, terminator)
    {
        var markerPos = remaining.indexOf(marker);
        if( markerPos < 0 )
        {
            return null;
        }

        remaining = remaining.substring(markerPos + marker.length);
        // A missing terminator gives indexOf() === -1, which substring() clamps to ''.
        return remaining.substring(0, remaining.indexOf(terminator));
    }

    while( true )
    {
        var result =
        {
            webname: 'NeiHanDuanzi',
            webid: '',
            type: '',
            context: '',
            pic_url: '',
            read_count: '',
            publish_time: '',
            best_comment: ''
        };

        // check gif
        // NOTE(review): `continue` only discards the text up to the flag; whether
        // that also skips the GIF's own record depends on where the flag sits
        // in the page - confirm against real markup.
        var gifFlag = takeField('"is_gif":"', '"');
        if( gifFlag !== null && Number(gifFlag) === 1 )
        {
            alert('NeiHanDuanzi:url is a gif:' + result.pic_url);
            continue;
        }

        // webid - its absence means the page holds no further joke.
        var webid = takeField('data-group-id="', '"');
        if( webid === null )
        {
            alert('NeiHanDuanzi:webid not find, page end.');
            // Leave before the push so no empty record is emitted.
            break;
        }
        result.webid = webid;
        alert('NeiHanDuanzi:webid-' + result.webid);

        // read_count
        var readCount = takeField('<span class="digg">', '</span>');
        if( readCount !== null )
        {
            result.read_count = readCount;
        }

        // context
        var context = takeField('data-text="', '"');
        if( context !== null )
        {
            result.context = context;
        }

        // pic_url
        var picUrl = takeField('data-pic="', '"');
        if( picUrl !== null )
        {
            result.pic_url = picUrl;
        }

        retVal.items.push(result);
    }

    return;
}

/**
 * Entry point of the NeiHanDuanzi script: downloads the text page (`webUrl`)
 * and the picture page (`imageUrl`), hands each one to getJoyFromOnePage()
 * and returns the shared `retVal` object serialised as JSON.
 *
 * @param {string} url - Not used by this script.
 * @param {string} parametersString - Object-literal text supplied by the host;
 *     it is forwarded to the host request as `scriptParamaters`.
 * @returns {string} JSON text of `retVal`; `success` is set to true only when
 *     both requests answered with status 200.
 */
function getJoyContextList( url, parametersString )
{
    // NOTE(review): eval() executes whatever text it receives. It is kept
    // because the host may pass object-literal text that is not strict JSON;
    // switch to JSON.parse once the host format is confirmed to be JSON.
    var parameters = eval("(" + parametersString + ")");

    // NOTE(review): the cookie is a browser session captured by hand and will
    // expire. The last value is the percent-encoded search term; its '%'
    // signs had been replaced by a placeholder token and are restored here.
    var cookie =
    [
        'uuid="w:0ef44d961a6d43c99dd81ecb51596731"',
        'sessionid=57f633c63c5de5d0bc03cddb0c6ee166',
        'tt_webid=5286193655',
        '__utmt=1',
        'csrftoken=d760789fbe1fc31edae4ac6c11c5a700',
        'Hm_lvt_773f1a5aa45c642cf87eef671e4d3f6a=1438825221,1438939411,1440988068,1440996740',
        'Hm_lpvt_773f1a5aa45c642cf87eef671e4d3f6a=1440996782',
        '__utma=101886750.2017161997.1438825217.1440995644.1440996740.6',
        '__utmb=101886750.5.10.1440996740',
        '__utmc=101886750',
        '__utmz=101886750.1440996740.6.4.utmcsr=haosou.com|utmccn=(organic)|utmcmd=organic|utmctr=%E5%86%85%E6%B6%B5%E7%A4%BE%E5%8C%BA'
    ].join('; ');

    var requestParams =
    {
        method: 'GET',
        version: 'HTTP/1.1',
        headers:
        {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Cookie': cookie,
            'Host': 'neihanshequ.com',
            'Pragma': 'no-cache',
            'Referer': 'http://neihanshequ.com/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'
        },
        scriptParamaters: parameters
    };
    var requestJson = JSON.stringify(requestParams);

    // Downloads one page and parses it; returns false when the request failed.
    // `urlName` only labels the log line.
    function fetchAndParse(pageUrl, urlName)
    {
        var httpRspString = syncHttpRequest(pageUrl, requestJson);
        // NOTE(review): same eval() caveat as above, here on the host's response text.
        var httpRsp = eval("(" + httpRspString + ")");
        // Loose comparison kept on purpose: the host may report the status as a string.
        if( !httpRsp || httpRsp.statusCode != 200 )
        {
            alert('NeiHanDuanzi: Request ' + urlName + '(' + pageUrl + ') failed return code:' + (httpRsp ? httpRsp.statusCode : 'undefined'));
            return false;
        }

        getJoyFromOnePage(httpRsp.data, requestParams);
        return true;
    }

    // Text jokes first, then pictures; the first failure aborts with success === false.
    if( !fetchAndParse(webUrl, 'webUrl') || !fetchAndParse(imageUrl, 'imageUrl') )
    {
        return JSON.stringify(retVal);
    }

    retVal.success = true;
    return JSON.stringify(retVal);
}
糗事百科网页抓取分析代码

// QiuShiBaiKe scraper settings.
// Page address pattern: http://www.qiushibaike.com/hot/page/{index}
var webUrl = 'http://www.qiushibaike.com/hot/page/';

// First and last page number to download (inclusive).
var index = 1;
var endIndex = 5;

// Result object shared by the functions of this script and returned as JSON.
var retVal = { success: false, items: [] };

/**
 * Extracts every joke found in one QiuShiBaiKe HTML page and appends one
 * record per joke to the shared `retVal.items` array.
 *
 * The page is consumed strictly left to right: every marker lookup discards
 * the text up to and including the matched marker, so the fields are searched
 * in the order webid, context, pic_url, read_count.
 *
 * @param {string} htmlData - Raw HTML of the page; a non-string is treated as an empty page.
 * @returns {undefined} Results are delivered through `retVal.items`.
 */
function getJoyFromOnePage(htmlData)
{
    // Unconsumed tail of the page.
    var remaining = (typeof htmlData === 'string') ? htmlData : '';

    // Drops everything up to and including `marker`; returns false (and leaves
    // the page untouched) when the marker is absent. Offset 0 counts as found.
    function skipPast(marker)
    {
        var markerPos = remaining.indexOf(marker);
        if( markerPos < 0 )
        {
            return false;
        }

        remaining = remaining.substring(markerPos + marker.length);
        return true;
    }

    // Returns the text in front of the next `terminator` without consuming it.
    // A missing terminator gives indexOf() === -1, which substring() clamps to ''.
    function readUntil(terminator)
    {
        return remaining.substring(0, remaining.indexOf(terminator));
    }

    while( true )
    {
        var result =
        {
            webname: 'QiuShiBaiKe',
            webid: '',
            type: '',
            context: '',
            pic_url: '',
            read_count: '',
            publish_time: '',
            best_comment: ''
        };

        // webid - its absence means the page holds no further joke.
        if( !skipPast('qiushi_tag_') )
        {
            alert('QiuShiBaiKe:webid not find, page end.');
            // Leave before the push so no empty record is emitted.
            break;
        }
        result.webid = readUntil("'>");
        alert('QiuShiBaiKe:webid-' + result.webid);

        // context
        if( skipPast('<div class="content">') )
        {
            result.context = readUntil("<!");
        }

        // pic_url - the image is looked up after the link to this joke's article.
        if( skipPast('<a href="/article/' + result.webid + '" target="_blank">') )
        {
            if( skipPast('<img src="') )
            {
                result.pic_url = readUntil('" alt="');
            }
        }

        // read_count
        if( skipPast('<span class="stats-vote"><i class="number">') )
        {
            result.read_count = readUntil("</i>");
        }

        retVal.items.push(result);
    }

    return;
}

/**
 * Entry point of the QiuShiBaiKe script: downloads the pages `index` through
 * `endIndex` of the hot list, hands each one to getJoyFromOnePage() and
 * returns the shared `retVal` object serialised as JSON.
 *
 * @param {string} url - Not used by this script.
 * @param {string} parametersString - Object-literal text supplied by the host;
 *     it is forwarded to the host request as `scriptParamaters`.
 * @returns {string} JSON text of `retVal`; `success` is set to true only when
 *     every page answered with status 200.
 */
function getJoyContextList( url, parametersString )
{
    // NOTE(review): eval() executes whatever text it receives. It is kept
    // because the host may pass object-literal text that is not strict JSON;
    // switch to JSON.parse once the host format is confirmed to be JSON.
    var parameters = eval("(" + parametersString + ")");
    var requestParams =
    {
        method: 'GET',
        version: 'HTTP/1.1',
        headers:
        {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
            'Host': 'www.qiushibaike.com',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
        },
        scriptParamaters: parameters
    };
    var requestJson = JSON.stringify(requestParams);

    // Iterate with a local counter: advancing the shared `index` itself would
    // leave it past `endIndex`, so a second call would download nothing.
    for( var page = index; page <= endIndex; page++ )
    {
        var trueUrl = webUrl + page;
        var httpRspString = syncHttpRequest(trueUrl, requestJson);
        // NOTE(review): same eval() caveat as above, here on the host's response text.
        var httpRsp = eval("(" + httpRspString + ")");
        // Loose comparison kept on purpose: the host may report the status as a string.
        if( !httpRsp || httpRsp.statusCode != 200 )
        {
            alert('QiuShiBaiKe: Request trueUrl(' + trueUrl + ') failed return code:' + (httpRsp ? httpRsp.statusCode : 'undefined'));
            return JSON.stringify(retVal);
        }

        getJoyFromOnePage(httpRsp.data);
    }

    retVal.success = true;
    return JSON.stringify(retVal);
}

功能描述:

主账号能输入物料信息(入库),子账号能输出物料信息(出库)。

显示入库和出库的记录。

显示当前剩余物料信息,并导出至excel

开发计划:

1、搭建JAVA SSH开发环境,完成简单的数据库注册 DEMO,并发布。–ok

2、设计数据库表。 ——–ok

3、设计功能及WEB操作界面 ——–ok

4、开发各个功能界面

a、站点管理修改 ———ok

b、物料分类管理 ———ok

c、代码整理 ——ok

d、用户与骑士站点对应关系 ——-ok

e、库存管理 ———-ok

f、BD 申请 ———-ok

g、站长批阅 ———-ok

h、管理员批阅 ———-ok

5、打包发布 ——-ok http://192.157.227.154:8080/Stock

6、测试程序。

a、逻辑优化

b、代码优化

名称:

双色球中奖邮件通知

功能:

1、每个开奖日后第二天以邮件方式通知是否中奖。

思路:

1、使用Python,跨平台,三方库功能强大

2、从网页上爬取当前的开奖号码 http://cp.360.cn/kj/ssq.html?agent=700007

3、与自己填写的号码检测是否匹配中奖

4、判断后发送邮件通知

5、每周2、4、7开奖,则每周3、5、1凌晨1点(可修改为上午十点最好)判断并发送。

# -*- coding: UTF-8 -*-

import smtplib
import urllib2
import string
import datetime
import time

from email.mime.text import MIMEText

from email.MIMEText import MIMEText
from email.Header import Header

# Numbers on my ticket: six red numbers and one blue number.
my_code_six = [1, 4, 8, 15, 17, 28]
my_code_one = 3

# Page the latest draw result is scraped from.
web_url = "http://cp.360.cn/kj/ssq.html?agent=700007"

# Latest draw, filled in by check_lottery().
lottery_date = ""
lottery_code_six = []
lottery_code_one = 0

# Match counters, recomputed by check_lottery().
winning_red_count = 0
winning_blue_count = 0

# Mail settings read by send_mail(); fill them in before running.
mailto_list = ";"  # recipient address(es)
mail_host = ""  # SMTP server
mail_user = ""  # user name
mail_pass = ""  # password
mail_postfix = "163.com"  # domain part of the sender address

def send_mail(to_list, sub, content):
    """Send a plain-text e-mail through the SMTP account configured at module level.

    Args:
        to_list: Recipient address(es). A string may hold several addresses
            separated by ";"; any other iterable is used as a list of addresses.
        sub: Subject line.
        content: Message body.

    Returns:
        True when the message was handed to the server, False when there is
        no recipient or any error occurred (the error is printed).
    """
    me = "中大奖" + "<" + mail_user + "@" + mail_postfix + ">"

    # smtplib treats a plain string as ONE address, so split it up first.
    if hasattr(to_list, "split"):
        recipients = [addr.strip() for addr in to_list.split(";") if addr.strip()]
    else:
        recipients = list(to_list)
    if not recipients:
        print("send_mail: no recipient address given")
        return False

    # The file is declared as UTF-8, so its byte strings must not be labelled gb2312.
    msg = MIMEText(content, _subtype='plain', _charset='utf-8')
    msg['Subject'] = sub
    msg['From'] = me
    msg['To'] = ", ".join(recipients)

    server = None
    try:
        server = smtplib.SMTP()
        server.connect(mail_host)
        server.login(mail_user, mail_pass)
        server.sendmail(me, recipients, msg.as_string())
        return True
    except Exception as e:
        print(str(e))
        return False
    finally:
        # Release the connection on the failure path as well.
        if server is not None:
            server.close()

def check_lottery():
    """Download the latest draw, compare it with my numbers and decide whether to notify.

    Updates the module-level lottery_date, lottery_code_six (the six red
    numbers as strings), lottery_code_one, winning_red_count and
    winning_blue_count.

    Returns:
        1 when the blue number matches or at least four red numbers match,
        otherwise 0. Also 0 when the page cannot be parsed.
    """
    global lottery_date
    global lottery_code_one
    global winning_red_count
    global winning_blue_count

    winning_red_count = 0
    winning_blue_count = 0
    # Forget the previous draw. The list is emptied in place so that old
    # numbers cannot be counted again on the next run.
    del lottery_code_six[:]

    response = urllib2.urlopen(web_url)
    try:
        html = response.read()
    finally:
        response.close()

    # date
    # str.find() is used throughout: it returns -1 when the text is missing,
    # whereas str.index() raises ValueError and would skip the checks below.
    key = "option value='"
    pos = html.find(key)
    if pos < 0:
        return 0
    # NOTE(review): assumes the 37 characters after the marker hold
    # "<issue>' code='NN NN NN NN NN NN+NN'" - confirm against the live page.
    html = html[pos + len(key): pos + len(key) + 37]
    print('GET NEW RESULT:   ' + html)
    pos = html.find("'")
    if pos <= 0:
        return 0
    lottery_date = html[0:pos]

    # codes: six red numbers separated by blanks, then "+" and the blue number
    key = "code='"
    pos = html.find(key)
    if pos < 0:
        return 0
    html = html[pos + len(key):]
    end = html.find("'")
    if end < 0:
        return 0
    reds_text, plus, one_code = html[0:end].partition("+")
    red_codes = reds_text.split()
    if not plus or len(red_codes) != 6:
        return 0
    try:
        red_numbers = [int(code) for code in red_codes]
        blue_number = int(one_code)
    except ValueError:
        return 0

    lottery_code_six.extend(red_codes)
    lottery_code_one = blue_number

    # One point per drawn red number that is also on my ticket.
    winning_red_count = sum(1 for number in red_numbers if number in my_code_six)
    if my_code_one == lottery_code_one:
        winning_blue_count = 1
        print("ok")

    print(my_code_one)
    print(one_code)

    # Notify when the blue number matches or at least four red numbers match.
    need_send_email = 0
    if winning_blue_count >= 1 or winning_red_count >= 4:
        need_send_email = 1

    print(need_send_email)
    return need_send_email

def check_date(now=None):
    """Tell whether it is time to check the draw: Monday, Wednesday or Friday at 01:00.

    Args:
        now: datetime.datetime to test; defaults to the current local time.

    Returns:
        1 when `now` falls on one of those weekdays with hour 1 and minute 0,
        otherwise 0.
    """
    if now is None:
        now = datetime.datetime.now()

    date_vaule = now.weekday() + 1  # 1 = Monday ... 7 = Sunday
    print('date_vaule:' + str(date_vaule))
    if date_vaule not in (1, 3, 5):
        return 0

    # The "%" of these format codes had been replaced by a placeholder token,
    # which made strftime() return that text instead of the numbers.
    hour = now.strftime("%H")
    minute = now.strftime("%M")
    print('hour:' + hour + '. minute:' + minute)
    if int(hour) != 1 or int(minute) != 0:
        return 0

    print('date_vaule:' + str(date_vaule) + '.hour:' + hour + '.minute:' + minute)
    return 1
 

# Script entry point: poll the clock forever and, whenever check_date() says it
# is time, check the draw and mail the outcome.
if __name__ == '__main__':
    while True:
        if check_date() <= 0:
            print('do noting')
            time.sleep(40)
        else:
            print("It's time to check lottery!!")
            result = check_lottery()
            print(result)

            # Message body: my numbers followed by the drawn numbers.
            send_message = "您的号码:"
            send_message += "".join(str(my) + "," for my in my_code_six)
            send_message += ":" + str(my_code_one) + "    "

            send_message += "开奖号码:"
            send_message += "".join(str(lottery) + "," for lottery in lottery_code_six)
            send_message += ":" + str(lottery_code_one)

            print(send_message)

            # Subject line depends on the outcome.
            if result > 0:
                head = "恭喜您中奖了,去看看吧"
            else:
                head = "下次一定能中的"
            print(head)

            if send_mail(mailto_list, head, send_message):
                print("Send email ok")
            else:
                print("Send email faile")
            # Sleep past the matching minute so the same draw is reported only once.
            time.sleep(70)

赛事简况:
1、原计划4小时30分,实际5小时04分,跑步中的苦和煎熬要远远比想象中的艰难。
2、10、20、25、30公里吃能量棒,每隔2.5公里都要补水和饮料,喝了大约4~5瓶
3、前半程用时2小时05分,力量充足,以为可以顺利完赛;26公里上个厕所,再次起跑时左膝旧伤突然复发。
4、带伤坚持到32公里,用时3小时14分左右,果断选择走路回去,后10公里走路回去。总计耗时5小时04分
5、前半程遇到一位老大爷,66岁,配速5分30秒,误差不过3秒,不补充水,太牛了,我跟着跑了前半程,半程后,我掉速,就再没追上这位大爷。

赛事总结:
1、完赛就是胜利。
2、平时训练不足,没有30+以上的拉练,直接导致后半程崩盘。
3、家人玩的都很开心,特别是儿子一路开启自动欢乐模式,这也是很大的收获。
4、半年之内调整休养,备战下一场马拉松:2016郑开马拉松。

赛外小事:
1、12号从酒店去体育中心领取装备,发生了一系列奇葩的事情。走20分钟找公交站,却只坐5分钟下车。领完装备计划去大型超市采购第二天的食物,用百度地图导航找了一个名字很霸气的超市,佳乐家第99连锁超市,跟着导航走了20分钟,却发现只是一个小卖铺,坑爹啊!然后继续导航找超市,往北走了20分钟又找到一个超市,却是在长途车站旁边,而车站附近卖东西都比较坑,于是就决定不买了!等准备坐车回酒店的时候,发现酒店就离我们有一个路口了,然后走路回去了!!!
2、跑步接近终点的时候,听见父亲和老婆在路口跟着我的名字喊加油。听着他们喊加油,我很激动,但是由于腿伤在身,我还是挥了挥手,继续拖着步子走向终点,内心各种滋味。
3、米粒一路上经常是看着路旁的景色或者是路上的影子自己咿咿呀呀的开心的不得了。

好久没来刷题了,今天刷了一道最简单的,找找感觉,一次编译通过的感觉太棒了。

题目:

You are playing the following Nim Game with your friend: There is a heap of stones on the table, each time one of you take turns to remove 1 to 3 stones. The one who removes the last stone will be the winner. You will take the first turn to remove the stones.

Both of you are very clever and have optimal strategies for the game. Write a function to determine whether you can win the game given the number of stones in the heap.

For example, if there are 4 stones in the heap, then you will never win the game: no matter 1, 2, or 3 stones you remove, the last stone will always be removed by your friend.

Hint:

If there are 5 stones in the heap, could you figure out a way to remove the stones such that you will always be the winner?
思路:

按照提示,谁最后拿的时候剩下四个子必输。

// Solution to the Nim Game problem quoted above: players alternately remove
// one to three stones and whoever removes the last stone wins.
class Solution
{
public:
    // Returns true when the player who moves first can force a win with a
    // heap of n stones. A non-positive n and every multiple of four give false.
    bool canWinNim(int n)
    {
        return n > 0 && n % 4 != 0;
    }
};
总结:

1、好久没刷LeetCode了,今天刷刷找找感觉。

2、还是思路,思路对了,代码就是对的。

3、一次通过,爽!不过这道题确实太简单了。