Crawling my Solo blog with a web scraper


The options were always there, but I chose you.

Building on the Feishu message-sending API I wrote earlier, I also crawl the blog, pull out each article's title and link, and embed them into the rich-text message.

  • Note: only articles rendered by Solo's built-in pingsu (Pinghsu) skin can be crawled this way.
import urllib.request
from lxml import etree
import re
class solo():
    def __init__(self,url):
        self.url = url
        self.Article_num = solo.get_ArticlesNums(self.url)
    def Theme(self):
        '''
        Detect the Solo skin; returns 'Casper' or 'Pinghsu' when the skin name is found in the page.
        :return:
        '''
        response = urllib.request.urlopen(self.url)
        html = response.read().decode()
        theme = (re.search('Casper|Pinghsu', html)).group()
        return theme

    def get_casper(self):
        titles_list = []
        links_list = []
        for i in range(1, self.Article_num + 1):
            url_1 = "https://www.cjzshilong.cn/?p=%s" % i  # URL of each pagination page
            response_1 = urllib.request.urlopen(url_1)
            html_1 = response_1.read().decode()
            parseHtml_1 = etree.HTML(html_1)
            article_num = len(parseHtml_1.xpath('//*[@id="pjax"]/div/div/article'))
            for n in range(1, article_num + 1):           # iterate over the article titles on this page
                titles = parseHtml_1.xpath('//*[@id="pjax"]/div/div/article[%s]/div/h2/a/text()' % n)
                links = parseHtml_1.xpath('//*[@id="pjax"]/div/div/article[%s]/div/h2/a//@href' % n)
                new_titles = ''.join(titles).strip().rsplit('\n')
                links_list.append(links)
                titles_list.append(new_titles)
        return titles_list, links_list
    @staticmethod
    def get_ArticlesNums(url):
        response = urllib.request.urlopen(url)
        html = response.read().decode()
        parseHtml = etree.HTML(html)
        all_article = len(parseHtml.xpath('//*[@id="pjax"]/div/nav/a')) + 1  # total number of pagination pages
        return all_article

if __name__ == '__main__':
    url = "https://www.cjzshilong.cn"
    info = solo(url=url)
    theme = info.Theme()
    if theme == 'Pinghsu':
        print('123')
    else:
        print('Casper')
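
One caveat with Theme(): if neither skin name appears in the page source, re.search('Casper|Pinghsu', html) returns None and calling .group() on it raises AttributeError. A more defensive variant could look like the sketch below (not part of the original script):

import re
import urllib.request

def detect_theme(url):
    '''Return 'Casper' or 'Pinghsu' if the skin name appears in the page, otherwise an empty string.'''
    html = urllib.request.urlopen(url).read().decode()
    match = re.search('Casper|Pinghsu', html)
    return match.group() if match else ''
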
  • Integrating with Feishu
#!/usr/bin/env python3
########################################################
#  Sends Feishu messages to Lijuan on a schedule       #
#  Date: 2020-2-24                                     #
#  Author: cuijianzhe                                  #
#  Email: 598941324@qq.com                             #
########################################################

import urllib.request
from lxml import etree
from bs4 import BeautifulSoup
import requests
import json
import os
import random
import re
class solo():
    def __init__(self,url):
        self.url = url
        self.Article_num = solo.get_ArticlesNums(self.url)
    def Theme(self):
        '''
        Detect the Solo skin; returns the matching string when it is Casper or Pinghsu.
        :return:
        '''
        response = urllib.request.urlopen(self.url)
        html = response.read().decode()
        theme = (re.search('Casper|Pinghsu', html)).group()
        return theme

    def get_pingsu(self):
        titles_list = []
        links_list = []
        for i in range(1, self.Article_num + 1):
            url_1 = "https://www.cjzshilong.cn/?p=%s" % i  # URL of each pagination page
            response_1 = urllib.request.urlopen(url_1)
            html_1 = response_1.read().decode()
            parseHtml_1 = etree.HTML(html_1)
            article_num = len(parseHtml_1.xpath('//*[@id="pjax"]/div/main/article'))
            for n in range(1, article_num + 1):
                titles = parseHtml_1.xpath('//*[@id="pjax"]/div/main/article[%s]/div/div[5]/h2/a/text()' % n)
                links = parseHtml_1.xpath('//*[@id="pjax"]/div/main/article[%s]/div/div[5]/h2/a//@href' % n)
                new_titles = ''.join(titles).strip().rsplit('\n')
                links_list.append(links)
                titles_list.append(new_titles)
        return titles_list, links_list
    def get_casper(self):
        titles_list = []
        links_list = []
        for i in range(1, self.Article_num + 1):
            url_1 = "https://www.cjzshilong.cn/?p=%s" % i  # URL of each pagination page
            response_1 = urllib.request.urlopen(url_1)
            html_1 = response_1.read().decode()
            parseHtml_1 = etree.HTML(html_1)
            article_num = len(parseHtml_1.xpath('//*[@id="pjax"]/div/div/article'))
            for n in range(1, article_num + 1):
                titles = parseHtml_1.xpath('//*[@id="pjax"]/div/div/article[%s]/div/h2/a/text()' % n)
                links = parseHtml_1.xpath('//*[@id="pjax"]/div/div/article[%s]/div/h2/a//@href' % n)
                new_titles = ''.join(titles).strip().rsplit('\n')
                links_list.append(links)
                titles_list.append(new_titles)
        return titles_list, links_list

    @staticmethod
    def get_ArticlesNums(url):
        response = urllib.request.urlopen(url)
        html = response.read().decode()
        parseHtml = etree.HTML(html)
        all_article = len(parseHtml.xpath('//*[@id="pjax"]/div/nav/a')) + 1  # total number of pagination pages
        return all_article

class feishu():
    def __init__(self,mobile):
        self.mobile = mobile
        self.token = feishu.get_token()
    def getloverwords(self):
        texts = []
        for i in range(1, random.randint(3, 83)):  # fetch a random number of quote-list pages
            url = 'https://www.duanwenxue.com/huayu/tianyanmiyu/list_{}.html'.format(i)
            response = requests.get(url)
            texts.append(response.text)
        articles = []
        for text in texts:
            soup = BeautifulSoup(text, 'lxml')
            arttis = soup.find('div', class_='list-short-article').find_all('a', {'target': "_blank"})  # locate the love-quote entries
            #  use a list comprehension to pull each <a> tag's text and strip surrounding whitespace
            articles.extend([arttis[i].text.strip() for i in range(len(arttis))])
        todaywords = articles[random.randint(0, len(articles) - 1)]  # pick one quote at random
        return todaywords
    def getuserid(self):
        headers_group = {
            "Authorization": "Bearer %s" % self.token,
            "Content-Type": "application/json"
        }
        try:
            userurl = "https://open.feishu.cn/open-apis/user/v1/batch_get_id?mobiles=%s" %self.mobile
            res_data = requests.get(url=userurl, headers=headers_group)
            code = json.loads(res_data.text).get('code')
            if code == 0:
                userid = json.loads(res_data.text)['data']['mobile_users'][self.mobile][0]['user_id']
                return userid
            else:
                error = json.loads(res_data.text).get('msg')
                print('Request error: {}'.format(error))
        except Exception:
            print('Request failed')

    def uploadimg(self):
        imgname = random.choice(os.listdir('/scripts/feishu/images'))
        with open("/scripts/feishu/images/%s" % imgname, 'rb') as p:
            image = p.read()
        imgurl = "https://open.feishu.cn/open-apis/image/v4/put/"
        headers = {"Authorization": "Bearer %s" % self.token}
        files = {
            'image': image
        }
        imgdata = {
            "image_type": "message"
        }
        resp = requests.post(url=imgurl, headers=headers, files=files, data=imgdata)
        os.remove('/scripts/feishu/images/%s' % imgname)
        resp.raise_for_status()
        content = resp.json()
        return content['data']['image_key']
    def sendmess(self,title, link, user_id, image_key=None):
        headers_group = {
            "Authorization": "Bearer %s" % self.token,
            "Content-Type": "application/json"
        }
        message_url = "https://open.feishu.cn/open-apis/message/v4/send/"
        # send a rich-text (post) message
        data = {
            "user_id": user_id,
            "msg_type": "post",
            "content": {
                "post": {
                    "zh_cn": {
                        "title": "表情包来了",
                        "content": [
                            [
                                {
                                    "tag": "text",
                                    "un_escape": True,
                                    "text": "%s " %self.getloverwords()
                                },
                            ],
                            [
                                {
                                    "tag": "text",
                                    "un_escape": True,
                                    "text": "博文推荐 :"
                                },
                                {
                                    "tag": "a",
                                    "text": "%s" % title,
                                    "href": "%s" % link
                                },
                                {
                                    "tag": "at",
                                    "user_id": user_id

                                }
                            ],
                            [
                                {
                                    "tag": "img",
                                    "image_key": image_key,
                                    "width": 1200,
                                    "height": 1200
                                }
                            ]
                        ]
                    }
                }
            }
        }
        request = requests.post(url=message_url, headers=headers_group, json=data)
    @staticmethod  # a static method can be called on the class or on an instance
    def get_token():  # static methods take neither self nor cls
        data = {"app_id":"cli_9exxxxd","app_secret":"YJJxxxxYUi"}
        headers = {"Content-Type": "application/json"}
        url_token = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal/"
        try:
            res = requests.post(url_token, json=data, headers=headers)
            if res.status_code == 200:
                token = (json.loads(res.text)).get('tenant_access_token')
                return token
        except Exception:
            print('Request failed')

if __name__ == '__main__':
    url = "https://www.cjzshilong.cn"
    solo_info = solo(url=url)
    theme = solo_info.Theme()
    mobiles = ["186xxxx6142","178xxxx4553"]
    for mobile in mobiles:
        if theme == 'Casper':
            res_info = solo_info.get_casper()  # returns the Casper article titles and links
            num = random.randint(0, len(res_info[0]) - 1)
            title = ''.join(res_info[0][num])
            link = ''.join(res_info[1][num])
            feishu_res = feishu(mobile)
            userID = feishu_res.getuserid()
            imgkey = feishu_res.uploadimg()
            feishu_res.sendmess(title,link,userID,imgkey)
        else:
            res_info = solo_info.get_pingsu()  # returns the Pinghsu article titles and links
            num = random.randint(0, len(res_info[0]) - 1)
            title = ''.join(res_info[0][num])
            link = ''.join(res_info[1][num])
            feishu_res = feishu(mobile)
            userID = feishu_res.getuserid()
            imgkey = feishu_res.uploadimg()
            feishu_res.sendmess(title, link, userID, imgkey)
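
Note that get_casper() / get_pingsu() each return a tuple of two lists, and every element of those lists is itself a small list (the raw XPath result), which is why the main block flattens each pick with ''.join(...). A rough illustration of the shape (the example values are made up):

titles_list, links_list = solo_info.get_casper()
# each element is itself a list, e.g.
#   titles_list -> [['Some article title'], ['Another title'], ...]
#   links_list  -> [['https://www.cjzshilong.cn/articles/...'], ...]
title = ''.join(titles_list[0])  # flatten the one-element list into a plain string
link = ''.join(links_list[0])
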

The result looks like this:

(screenshot: image.png)

Scraping the article titles and links for the Casper and Pinghsu skins (standalone scraper version):

import urllib.request
from lxml import etree

class solo():
    def __init__(self,url):
        self.url = url
        self.Article_num = solo.get_ArticlesNums(self.url)
    def Theme(self):
        '''
        Detect the Solo skin from the footer text; returns the corresponding value for Casper, otherwise an empty string.
        :return:
        '''
        response = urllib.request.urlopen(self.url)
        html = response.read().decode()
        parseHtml = etree.HTML(html)
        theme = parseHtml.xpath('/html/body/footer/div/div[1]/text()[5]')  # theme info read from the footer
        info = ''.join(''.join(theme).strip().rsplit('\n'))
        return info

    def get_pingsu(self):
        titles_list = []
        links_list = []
        for i in range(1, self.Article_num + 1):
            url_1 = "https://www.cjzshilong.cn/?p=%s" % i  # URL of each pagination page
            response_1 = urllib.request.urlopen(url_1)
            html_1 = response_1.read().decode()
            parseHtml_1 = etree.HTML(html_1)
            article_num = len(parseHtml_1.xpath('//*[@id="pjax"]/div/main/article'))
            for n in range(1, article_num + 1):
                titles = parseHtml_1.xpath('//*[@id="pjax"]/div/main/article[%s]/div/div[5]/h2/a/text()' % n)
                links = parseHtml_1.xpath('//*[@id="pjax"]/div/main/article[%s]/div/div[5]/h2/a//@href' % n)
                new_titles = ''.join(titles).strip().rsplit('\n')
                links_list.append(links)
                titles_list.append(new_titles)
        return titles_list, links_list
    def get_casper(self):
        titles_list = []
        links_list = []
        for i in range(1, self.Article_num + 1):
            url_1 = "https://www.cjzshilong.cn/?p=%s" % i  # URL of each pagination page
            response_1 = urllib.request.urlopen(url_1)
            html_1 = response_1.read().decode()
            parseHtml_1 = etree.HTML(html_1)
            article_num = len(parseHtml_1.xpath('//*[@id="pjax"]/div/div/article'))
            for n in range(1, article_num + 1):
                titles = parseHtml_1.xpath('//*[@id="pjax"]/div/div/article[%s]/div/h2/a/text()' % n)
                links = parseHtml_1.xpath('//*[@id="pjax"]/div/div/article[%s]/div/h2/a//@href' % n)
                new_titles = ''.join(titles).strip().rsplit('\n')
                links_list.append(links)
                titles_list.append(new_titles)
        return titles_list, links_list
    @staticmethod
    def get_ArticlesNums(url):
        response = urllib.request.urlopen(url)
        html = response.read().decode()
        parseHtml = etree.HTML(html)
        all_article = len(parseHtml.xpath('//*[@id="pjax"]/div/nav/a')) + 1  # total number of pagination pages
        return all_article

    def main(self):
        return self.Theme()

if __name__ == '__main__':
    url = "https://www.cjzshilong.cn"
    info = solo(url=url)
    theme = info.main()
    if theme != "":
        res = info.get_casper()
    else:
        res = info.get_pingsu()
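
For a quick sanity check, the scraped results can be printed to the console (a hypothetical snippet appended after the theme check above; res is the tuple returned by get_casper()/get_pingsu()):

# hypothetical: print every scraped title together with its link
titles, links = res
for t, l in zip(titles, links):
    print(''.join(t), '->', ''.join(l))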

Title: Crawling my Solo blog with a web scraper
Author: cuijianzhe
Link: https://cuijianzhe.github.io/articles/2020/03/25/1585126708916.html