先声明一下,本次分享仅用于技术交流。
爬虫与反爬虫可谓相爱相杀。
背景
以前做的一个小项目需要用到电影票房的数据,市面上商业api也不太好用,最终选择了猫眼专业版。最近发现通过接口获取到的数据都是乱码,默念“mmp”,很清楚猫眼又升级了反爬虫技术。最近两年可以说见证了猫眼专业版反爬虫技术的全过程,也是一次比一次难应付。
分析
不同于以往,本次猫眼采用了对关键数据使用woff字体呈现的方案,通过爬虫获取到的数据都是形如 `&#xe137;.&#xf4ee;&#xe137;`(仅为示例,实际编码每次都不同)这样的HTML实体字符串,
该示例代表1.01。刚开始还异想天开的认为,还好也就10个数字,只要匹配好对应关系就大功告成。但是很快就被猫眼的技术打脸了,虽然woff字符只有10个,分别与10个数字对应,但是每次获取到的数据中woff字符都是不一样的,就像是每次都对10个数字随机加密,每次靠肉眼匹配对应关系简直就是痴人说梦。
探索
“遇到的任何困难都不是第一个碰到”,带着这样的信念,很快就在网上找到一些蛛丝马迹,网上有很多关于woff字符转成图片的技术实现。站在巨人的肩膀上,很自然的就想到了只要使用飞桨实现数字识别,就可以将woff字体转换为数字。
方案
- woff字体转图片
- 使用PaddleHub或PaddleOCR或baidu-aip实现数字识别,得到woff字符与数字的对应关系
扩展
- woff文件导出为xml文件,如
font.xml
- woff文件中的字符保存为图片,如
1.jpg
安装相关第三方包
python
# !pip install fontTools
# !pip install reportlab
# !pip install shapely pyclipper
# !pip install baidu-aip
引入相关包
python
import re
import os
import cv2
import time
import requests
import numpy as np
import paddlehub
from PIL import Image
from aip import AipOcr
from fontTools import ttLib
from fontTools.ttLib import TTFont
from fontTools.pens.basePen import BasePen
from reportlab.graphics.shapes import Path
from reportlab.lib import colors
from reportlab.graphics import renderPM
from reportlab.graphics.shapes import Group, Drawing, scale
实现Woff2Text类
python
# woff = Woff2Text(url, tp='aip', option=option)
# url为.woff文件链接
# tp代表数字识别的类型,aip代表baidu-aip,hub代表paddlehub,使用baidu-aip需要在option中配置APP_ID、API_KEY、SECRET_KEY参数
python
class ReportLabPen(BasePen):
    """Pen that replays glyph outline commands onto a reportlab ``Path``.

    The accumulated outline is available as ``self.path`` once a glyph has
    been drawn with this pen.
    """

    def __init__(self, glyphSet, path=None):
        """Store the target path, creating an empty ``Path`` when none is given."""
        super().__init__(glyphSet)
        self.path = Path() if path is None else path

    def _moveTo(self, p):
        """Start a new sub-path at point ``p``."""
        x, y = p
        self.path.moveTo(x, y)

    def _lineTo(self, p):
        """Append a straight segment ending at point ``p``."""
        x, y = p
        self.path.lineTo(x, y)

    def _curveToOne(self, p1, p2, p3):
        """Append one curve segment given its three points."""
        (x1, y1), (x2, y2), (x3, y3) = p1, p2, p3
        self.path.curveTo(x1, y1, x2, y2, x3, y3)

    def _closePath(self):
        """Close the current sub-path."""
        self.path.closePath()
class Woff2Text:
    """Translate text rendered with a .woff font back into plain text.

    On construction the font at ``url`` is downloaded, each glyph is rendered
    to an image file, and every image is run through OCR. The resulting
    ``glyph name -> recognised text`` mapping is stored in ``self.words`` and
    is used by :meth:`text` to decode entity strings such as
    ``"&#xe137;.&#xf4ee;"``.
    """

    def __init__(self, url, tp='aip', fmt="png", option=None):
        """Download the font, render its glyphs and recognise them.

        Args:
            url: Link to the .woff file.
            tp: Recognition backend. ``'aip'`` uses baidu-aip (``AipOcr``);
                any other value loads the PaddleHub module
                ``chinese_ocr_db_crnn_server``.
            fmt: Image format used for the rendered glyph files.
            option: Mapping with ``APP_ID``, ``API_KEY`` and ``SECRET_KEY``;
                required when ``tp == 'aip'``.

        Raises:
            Exception: If ``tp == 'aip'`` and ``option`` is empty or lacks one
                of the three keys.
        """
        self.base_path = 'woff_img/'
        self.__fmt = fmt
        self.__url = url
        self.__tp = tp
        if tp == 'aip':
            if not option:
                raise Exception('请配置APP_ID、API_KEY、SECRET_KEY')
            # Only a missing/invalid credential mapping is reported as a
            # configuration problem; the client is built outside the try so
            # unrelated errors are not masked.
            try:
                app_id = option['APP_ID']
                api_key = option['API_KEY']
                secret_key = option['SECRET_KEY']
            except (KeyError, TypeError) as err:
                raise Exception('请配置APP_ID、API_KEY、SECRET_KEY') from err
            self.client = AipOcr(app_id, api_key, secret_key)
        else:
            self.ocr = paddlehub.Module(name="chinese_ocr_db_crnn_server")
        self.__create_path()
        self.__get_woff()
        self.__woff2image()
        self.__image2text()

    def __create_path(self):
        """Create the directories for the font file and the glyph images.

        Layout: ``<base_path><font name>/<font file>`` and
        ``<base_path><font name>/images/``.
        """
        name = self.__url.split('/')[-1]
        filedir = self.base_path + name.replace('.woff', '')
        imagespath = filedir + '/images'
        # Creates base_path and filedir as needed; existing dirs are fine.
        os.makedirs(imagespath, exist_ok=True)
        self.__woffPath = filedir + '/' + name
        self.__imgPath = imagespath + '/'

    def get_imagedir(self):
        """Return the directory (with trailing slash) holding the glyph images."""
        return self.__imgPath

    def __get_woff(self):
        """Download the font file to the path prepared by ``__create_path``."""
        # The timeout prevents hanging forever; raise_for_status avoids
        # saving an HTTP error page as if it were a font.
        response = requests.get(self.__url, timeout=30)
        response.raise_for_status()
        with open(self.__woffPath, 'wb') as f:
            f.write(response.content)

    def __woff2image(self):
        """Render every usable glyph of the font to ``<glyph name>.<fmt>``."""
        font = TTFont(self.__woffPath)
        try:
            gs = font.getGlyphSet()
            for glyph_name in font.getGlyphNames():
                # Skip 'glyph00000', 'x' and names such as '.notdef'/'.null'.
                if glyph_name in ('glyph00000', 'x') or glyph_name.startswith('.'):
                    continue
                glyph = gs[glyph_name]
                pen = ReportLabPen(gs, Path(fillColor=colors.black, strokeWidth=1))
                glyph.draw(pen)
                # Both dimensions are derived from glyph.width.
                w, h = glyph.width, glyph.width
                group = Group(pen.path)
                group.translate(w, h * 1.5)
                drawing = Drawing(w * 3, h * 4.5)
                drawing.add(group)
                image_file = os.path.join(
                    self.__imgPath, glyph_name + "." + self.__fmt)
                renderPM.drawToFile(drawing, image_file, self.__fmt)
        finally:
            font.close()

    def get_file_content(self, filePath):
        """Return the raw bytes of the file at ``filePath``."""
        with open(filePath, 'rb') as fp:
            return fp.read()

    def __recognize_text(self, image_dir):
        """Run OCR on one glyph image and return the backend's result list.

        Args:
            image_dir: Path of the image file to recognise.

        Returns:
            ``result['words_result']`` for the aip backend, otherwise
            ``result[0]['data']`` from the PaddleHub module.
        """
        with Image.open(image_dir) as source:
            w, h = source.size
            # Shrink to a tenth of the rendered size, never below one pixel.
            img = source.resize((max(1, int(w / 10)), max(1, int(h / 10))))
        if self.__tp == 'aip':
            # Encode in memory instead of leaving a scratch file on disk.
            import io
            buffer = io.BytesIO()
            img.save(buffer, 'png')
            result = self.client.basicGeneral(buffer.getvalue())
            return result['words_result']
        result = self.ocr.recognize_text(images=[np.array(img)])
        return result[0]['data']

    def __image2text(self):
        """Build ``self.words``: glyph name -> recognised text.

        A glyph whose image yields no result, or a result that is not an
        integer, is stored as ``''``. If exactly one digit of 0-9 was never
        recognised, the first such glyph is assigned that digit.
        """
        texts = {}
        recognised = set()
        # Sorted so that the fill-in below is deterministic.
        for image_name in sorted(os.listdir(self.__imgPath)):
            name = image_name.split('.')[0]
            data = self.__recognize_text(self.__imgPath + image_name)
            texts[name] = ''
            if not data:
                continue
            num = data[0]['words'] if self.__tp == 'aip' else data[0]['text']
            try:
                recognised.add(int(num))
            except (TypeError, ValueError):
                # Non-numeric OCR output is treated as "not recognised".
                continue
            texts[name] = num
        missing = set(range(10)) - recognised
        if len(missing) == 1:
            (digit,) = missing
            for name, value in texts.items():
                if not value:
                    texts[name] = str(digit)
                    break
        self.words = texts

    def single_text(self, single_str):
        """Return the text mapped to glyph name ``single_str``, or ``''``."""
        return self.words.get(single_str, '')

    def text(self, woff_str):
        """Decode a ``;``-separated string of ``&#x....`` entities.

        Each piece is upper-cased and its ``&#X`` prefix replaced by ``uni``
        to form the glyph name. A piece containing ``.`` contributes
        ``'.' + text`` when its glyph is known and nothing otherwise.

        Args:
            woff_str: For example ``"&#xe137;.&#xf4ee;"``.

        Returns:
            The concatenated decoded text.
        """
        parts = []
        for piece in woff_str.split(';'):
            key = piece.upper().replace('&#X', 'uni')
            if '.' in key:
                num = self.single_text(key.replace('.', ''))
                if num:
                    num = '.' + num
            else:
                num = self.single_text(key)
            parts.append(num)
        return ''.join(parts)
应用
python
# Example: fetch the dashboard data and decode the font-obfuscated numbers.
# Headers sent with the dashboard request.
header = {
    "Host": "piaofang.maoyan.com",
    "Referer": "http://piaofang.maoyan.com/dashboard",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
}
# The timeout keeps the script from hanging if the server never answers.
r = requests.get(
    'http://piaofang.maoyan.com/dashboard-ajax?orderType=0',
    headers=header,
    timeout=30,
).json()
# Raw string with escaped dots so "." only matches a literal dot.
font_urls = re.findall(
    r's3plus\.meituan\.net/v1/mss_[0-9a-z/]+\.woff', r['fontStyle'])
if not font_urls:
    # Fail early with a clear message instead of passing '' to Woff2Text.
    raise RuntimeError('no .woff font URL found in the fontStyle field')
url = 'http://' + font_urls[0]
# Using baidu-aip requires applying for your own credentials:
# APP_ID = ''
# API_KEY = ''
# SECRET_KEY = ''
# opt = {
#     "APP_ID": APP_ID,
#     "API_KEY": API_KEY,
#     "SECRET_KEY": SECRET_KEY
# }
# woff = Woff2Text(url, option=opt)
# Recognise the glyphs with paddlehub.
woff = Woff2Text(url, tp='hub')
movies = []
for movie in r['movieList']['data']['list']:
    # Replace the obfuscated 'num' strings with their decoded text in place.
    boxSplitUnit = movie['boxSplitUnit']
    boxSplitUnit['num'] = woff.text(boxSplitUnit['num'])
    movie['boxSplitUnit'] = boxSplitUnit
    splitBoxSplitUnit = movie['splitBoxSplitUnit']
    splitBoxSplitUnit['num'] = woff.text(splitBoxSplitUnit['num'])
    movie['splitBoxSplitUnit'] = splitBoxSplitUnit
    movies.append(movie)
print(movies)