Skip to content

先声明一下,本次分享仅用于技术交流。

爬虫与反爬虫可谓相爱相杀。

背景

以前做的一个小项目需要用到电影票房的数据,市面上商业api也不太好用,最终选择了猫眼专业版。最近发现通过接口获取到的数据都是乱码,默念“mmp”,很清楚猫眼又升级了反爬虫技术。最近两年可以说见证了猫眼专业版反爬虫技术的全过程,也是一次比一次难应付。

分析

不同于以往,本次猫眼采用了对关键数据使用woff字体呈现的方案,通过爬虫获取到的数据都是形如`&#xe137;.&#xf0a2;&#xe137;`这样的字符实体(此处编码仅为示例),代表1.01。刚开始还异想天开地认为,还好也就10个数字,只要匹配好对应关系就大功告成。但是很快就被猫眼的技术打脸了:虽然woff字符只有10个,分别与10个数字对应,但是每次获取到的字体文件中,字符编码与数字的对应关系都不一样,就像是每次都对10个数字重新随机加密,每次靠肉眼匹配对应关系简直就是痴人说梦。

探索

“你遇到的任何困难,都不会是第一个碰到的”,带着这样的信念,很快就在网上找到了一些蛛丝马迹:网上有很多关于woff字符转成图片的技术实现。站在巨人的肩膀上,很自然地就想到,只要使用飞桨实现数字识别,就可以将woff字体转换为数字。

方案

  1. woff字体转图片
  2. 使用PaddleHub或PaddleOCR或baidu-aip实现数字识别,得到woff字符与数字的对应关系

扩展

  1. woff文件导出为xml文件,如font.xml
  2. woff文件中的字符保存为图片,如1.jpg


安装相关第三方包

python
# !pip install fontTools
# !pip install reportlab
# !pip install shapely pyclipper
# !pip install baidu-aip

引入相关包

python
import re
import os
import cv2
import time
import requests
import numpy as np
import paddlehub
from PIL import Image
from aip import AipOcr
from fontTools import ttLib
from fontTools.ttLib import TTFont
from fontTools.pens.basePen import BasePen
from reportlab.graphics.shapes import Path
from reportlab.lib import colors
from reportlab.graphics import renderPM
from reportlab.graphics.shapes import Group, Drawing, scale

实现Woff2Text类

python
# woff = Woff2Text(url, tp='aip', option=option)
# url为.woff文件链接
# tp代表数字识别的类型,aip代表baidu-aip,hub代表paddlehub,使用baidu-aip需要在option中配置APP_ID、API_KEY、SECRET_KEY参数
python
class ReportLabPen(BasePen):
    """Pen that replays glyph drawing callbacks onto a reportlab ``Path``.

    The resulting path is available as ``self.path`` once a glyph has been
    drawn with this pen.
    """

    def __init__(self, glyphSet, path=None):
        """Store the target path, creating an empty ``Path`` when none is given."""
        super().__init__(glyphSet)
        self.path = Path() if path is None else path

    def _moveTo(self, p):
        """Start a new sub-path at point ``p``."""
        px, py = p
        self.path.moveTo(px, py)

    def _lineTo(self, p):
        """Append a straight segment ending at point ``p``."""
        px, py = p
        self.path.lineTo(px, py)

    def _curveToOne(self, p1, p2, p3):
        """Append one curve segment with control points ``p1``, ``p2`` and end point ``p3``."""
        (c1x, c1y), (c2x, c2y), (end_x, end_y) = p1, p2, p3
        self.path.curveTo(c1x, c1y, c2x, c2y, end_x, end_y)

    def _closePath(self):
        """Close the current sub-path."""
        self.path.closePath()
 

class Woff2Text:
    """Translate text rendered with an obfuscated WOFF font back to digits.

    On construction the font at ``url`` is downloaded, every glyph is rendered
    to an image, and each image is passed through OCR. The resulting table
    (glyph name -> recognized digit string) is stored in ``self.words`` and is
    what ``text`` / ``single_text`` look values up in.

    Args:
        url: Link to the ``.woff`` file.
        tp: OCR backend. ``'aip'`` uses baidu-aip; any other value uses
            paddlehub.
        fmt: Image format used when rendering glyphs.
        option: Dict holding ``APP_ID``, ``API_KEY`` and ``SECRET_KEY``;
            required when ``tp == 'aip'``.

    Raises:
        Exception: If ``tp == 'aip'`` and the credentials are missing.
    """

    def __init__(self, url, tp='aip', fmt="png", option=None):
        self.base_path = 'woff_img/'
        self.__fmt = fmt
        self.__url = url
        self.__tp = tp
        if tp == 'aip':
            if not option:
                raise Exception('请配置APP_ID、API_KEY、SECRET_KEY')
            try:
                credentials = [
                    option[key] for key in ('APP_ID', 'API_KEY', 'SECRET_KEY')
                ]
            except (KeyError, TypeError) as err:
                # Only a missing/ill-typed option is a configuration problem;
                # anything else should surface unchanged.
                raise Exception('请配置APP_ID、API_KEY、SECRET_KEY') from err
            self.client = AipOcr(*credentials)
        else:
            self.ocr = paddlehub.Module(name="chinese_ocr_db_crnn_server")
        self.__create_path()
        self.__get_woff()
        self.__woff2image()
        self.__image2text()

    def __create_path(self):
        """Create the folders for the font file and its rendered glyph images."""
        name = self.__url.split('/')[-1]
        filedir = self.base_path + name.replace('.woff', '')
        imagespath = filedir + '/images'
        # makedirs also creates base_path and filedir, and tolerates reruns.
        os.makedirs(imagespath, exist_ok=True)
        self.__woffPath = filedir + '/' + name
        self.__imgPath = imagespath + '/'

    def get_imagedir(self):
        """Return the directory (with trailing slash) holding the glyph images."""
        return self.__imgPath

    def __get_woff(self):
        """Download the font file to the local font path.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        response = requests.get(self.__url, timeout=30)
        # Fail here instead of saving an error page and breaking font parsing.
        response.raise_for_status()
        with open(self.__woffPath, 'wb') as f:
            f.write(response.content)

    def __woff2image(self):
        """Render every usable glyph of the font to an image file."""
        font = TTFont(self.__woffPath)
        gs = font.getGlyphSet()
        skipped = ('glyph00000', 'x')

        for glyph_name in font.getGlyphNames():
            # Skip the known non-digit glyphs and names such as
            # '.notdef' / '.null'; a font lacking them is not an error.
            if glyph_name in skipped or glyph_name[0] == '.':
                continue

            glyph = gs[glyph_name]
            pen = ReportLabPen(gs, Path(fillColor=colors.black, strokeWidth=1))
            glyph.draw(pen)
            # The glyph width is used for both dimensions of the canvas.
            w, h = glyph.width, glyph.width
            group = Group(pen.path)
            group.translate(w, h * 1.5)
            drawing = Drawing(w * 3, h * 4.5)
            drawing.add(group)
            image_file = self.__imgPath + glyph_name + "." + self.__fmt
            renderPM.drawToFile(drawing, image_file, self.__fmt)

    def get_file_content(self, filePath):
        """Return the raw bytes of the file at ``filePath``."""
        with open(filePath, 'rb') as fp:
            return fp.read()

    def __recognize_text(self, image_dir):
        """Run OCR on one glyph image and return the backend's result list.

        The image is shrunk to one tenth of its size before recognition.
        """
        with Image.open(image_dir) as source:
            w, h = source.size
            img = source.resize((int(w / 10), int(h / 10)))
        if self.__tp == 'aip':
            import io

            # Encode in memory so no scratch file is left in the working dir.
            buffer = io.BytesIO()
            img.save(buffer, 'png')
            result = self.client.basicGeneral(buffer.getvalue())
            return result['words_result']
        result = self.ocr.recognize_text(images=[np.array(img)])
        return result[0]['data']

    def __image2text(self):
        """Build ``self.words``: glyph name -> recognized digit string.

        A glyph whose OCR output is not a single digit is stored as ``''``.
        When exactly one digit of 0-9 was never recognized, it is assigned to
        the first glyph that has no value.
        """
        result_key = 'words' if self.__tp == 'aip' else 'text'
        recognized = set()
        texts = {}
        for image_name in os.listdir(self.__imgPath):
            name = image_name.split('.')[0]
            data = self.__recognize_text(self.__imgPath + image_name)
            digit = data[0][result_key].strip() if data else ''
            if len(digit) == 1 and digit in '0123456789':
                recognized.add(int(digit))
            else:
                # Anything other than one digit is treated as an OCR miss
                # rather than crashing on int().
                digit = ''
            texts[name] = digit

        missing = set(range(10)) - recognized
        if len(missing) == 1:
            (only_missing,) = missing
            for name, value in texts.items():
                if not value:
                    texts[name] = str(only_missing)
                    break
        self.words = texts

    def single_text(self, single_str):
        """Return the digit mapped to glyph name ``single_str``, or ``''``."""
        return self.words.get(single_str, '')

    def text(self, woff_str):
        """Decode a string of ``;``-separated character entities to digits.

        Each entity is upper-cased and its ``&#X`` prefix replaced by ``uni``
        (``'&#xe137'`` is looked up as ``'uniE137'``). A token containing
        ``.`` keeps the dot in front of its digit when the digit is known.
        Tokens with no mapping contribute nothing.
        """
        parts = []
        for token in woff_str.split(';'):
            key = token.upper().replace('&#X', 'uni')
            if '.' in key:
                digit = self.single_text(key.replace('.', ''))
                if digit:
                    digit = '.' + digit
            else:
                digit = self.single_text(key)
            parts.append(digit)
        return ''.join(parts)

应用

python
# Request headers sent with the dashboard request.
header = {
    "Host": "piaofang.maoyan.com",
    "Referer": "http://piaofang.maoyan.com/dashboard",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
}
response = requests.get(
    'http://piaofang.maoyan.com/dashboard-ajax?orderType=0',
    headers=header,
    timeout=30,
)
response.raise_for_status()
r = response.json()

# Raw string with escaped dots so '.' only matches a literal dot.
font_urls = re.findall(r's3plus\.meituan\.net/v1/mss_[0-9a-z/]+\.woff', r['fontStyle'])
if not font_urls:
    # Without a font there is nothing to decode; stop with a clear message
    # instead of handing an empty URL to Woff2Text.
    raise RuntimeError('No .woff font URL found in the fontStyle field')
url = 'http://' + font_urls[0]

# Using baidu-aip requires applying for your own credentials:
# APP_ID = ''
# API_KEY = ''
# SECRET_KEY = ''
# opt = {
#     "APP_ID": APP_ID,
#     "API_KEY": API_KEY,
#     "SECRET_KEY": SECRET_KEY
#     }
# woff = Woff2Text(url, option=opt)

# Recognize the glyphs with paddlehub.
woff = Woff2Text(url, tp='hub')
movies = []
for movie in r['movieList']['data']['list']:
    # Both fields carry an obfuscated 'num'; decode them in place.
    for field in ('boxSplitUnit', 'splitBoxSplitUnit'):
        movie[field]['num'] = woff.text(movie[field]['num'])
    movies.append(movie)

print(movies)

飞桨PPDB——深度学习社区