天天看点

解决猫眼自定义字体的问题-获取影院实时电影排片或实时票房等

# 思路(字体是不变的,但是字符编码会改变,动态解析字体文件)
# 1.爬取准备,获取网页的字体库,使用软件得到相对应数字或汉字对应的编码
# 2.在爬取网页的时候,正常获取出需要解码的数字或汉字的编码,
# 3.然后利用fontTools.ttLib来对照解析字体库(动态将获取的woff字体文件通过库转换成otf格式)
# 4.最后获取出需要解码的部分,再替换成对应的字符,然后大功告成,拼接我们需要的字段

上面是整体思路,如果想看详细介绍,可以查看我前2篇转载的文章

import requests
from lxml import html
import re
import woff2otf
from fontTools.ttLib import TTFont
from bs4 import BeautifulSoup as bs

class MaoyanSpider:
    """Print the showtimes of one Maoyan cinema page with its numbers decoded.

    The page draws numbers with a downloadable web font, so the text in the
    HTML is not the digits a visitor sees. The glyph outlines of that font are
    compared with those of a local reference font (``base.otf``) whose
    glyph-to-digit assignment is known; every outline that matches yields one
    entry of a character-to-digit table for the font just downloaded.
    """

    # Text drawn by each reference glyph: BASE_DIGITS[k] belongs to
    # BASE_GLYPH_NAMES[k] (glyph names as they appear in base.otf).
    BASE_DIGITS = ['.', '3', '5', '1', '2', '7', '0', '6', '9', '8', '4']
    BASE_GLYPH_NAMES = ['x', 'uniE64B', 'uniE183', 'uniED06', 'uniE1AC', 'uniEA2D', 'uniEBF8',
                        'uniE831', 'uniF654', 'uniF25B', 'uniE3EB']

    # Captures the protocol-relative URL of the woff entry in the page's font
    # declaration. Whitespace is matched loosely so the lookup does not depend
    # on the exact indentation of the page source.
    FONT_URL_RE = re.compile(r",\s*url\('(//[^']+\.woff)'\)\s*format\('woff'\)")

    def __init__(self):
        """Store the browser-like request headers used for the page request."""
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36"
        }

    @staticmethod
    def _glyph_char(glyph_name):
        """Return the character named by a ``uniXXXX`` glyph name, else ``None``."""
        if not glyph_name.startswith("uni"):
            return None
        try:
            return chr(int(glyph_name[3:], 16))
        except (ValueError, OverflowError):
            # Not a hexadecimal code point: nothing in the page text can be
            # tied to this glyph by name.
            return None

    @classmethod
    def _build_char_map(cls, glyph_order, page_glyphs, base_glyphs):
        """Map each obfuscated character of the page font to the text it draws.

        Args:
            glyph_order: glyph names of the page font; entry 0 is skipped.
            page_glyphs: mapping from glyph name to glyph for the page font.
            base_glyphs: mapping from glyph name to glyph for the reference
                font; must contain every name in ``BASE_GLYPH_NAMES``.

        Returns:
            A dict from a one-character string found in the page text to the
            digit (or ``'.'``) it stands for. Glyphs that equal no reference
            glyph are left out, so a miss can never shift the other entries.
        """
        # Look the reference glyphs up once instead of once per page glyph.
        reference = [
            (base_glyphs[name], digit)
            for name, digit in zip(cls.BASE_GLYPH_NAMES, cls.BASE_DIGITS)
        ]
        char_map = {}
        for position, name in enumerate(glyph_order[1:], start=1):
            # The first real glyph is always treated as the letter 'x',
            # whatever its name in the downloaded font.
            char = "x" if position == 1 else cls._glyph_char(name)
            if char is None:
                continue
            glyph = page_glyphs[name]
            for candidate, digit in reference:
                if glyph == candidate:
                    char_map[char] = digit
                    break
        return char_map

    def _download_font(self, page_text, target="maoyan.woff"):
        """Save the woff font referenced by *page_text* to the file *target*.

        Raises:
            IndexError: if the page contains no woff font URL.
        """
        urls = self.FONT_URL_RE.findall(page_text)
        if not urls:
            # Same exception type an empty result used to produce, but with a
            # message that says what is actually missing.
            raise IndexError("no woff font URL found in the page")
        font_response = requests.get("http:" + urls[0], stream=True)
        with open(target, "wb") as font_file:
            for chunk in font_response.iter_content(chunk_size=1024):
                if chunk:
                    font_file.write(chunk)

    # Fetch the cinema page and print its showtimes
    def getNote(self):
        """Fetch the cinema page, decode its obfuscated numbers and print the rows.

        Side effects: writes ``maoyan.woff`` and ``maoyan.otf`` to the working
        directory, reads ``base.otf`` from it, and prints one line per
        (movie title, showtime) combination.

        Raises:
            IndexError: if the page contains no woff font URL.
        """
        url = "http://maoyan.com/cinema/15887?poi=91871213"
        headers = dict(self.headers)
        # NOTE(review): the key 'refer' is sent exactly as written; if an HTTP
        # Referer header was intended, the name should be 'Referer' - confirm.
        headers.update({'Host': 'maoyan.com',
                        'refer': 'http://maoyan.com/news'})
        page_text = requests.get(url, headers=headers).text

        # Download the page's font and convert woff -> otf for fontTools.
        self._download_font(page_text)
        woff2otf.convert('maoyan.woff', 'maoyan.otf')

        base_font = TTFont('base.otf')
        page_font = TTFont('maoyan.otf')
        char_map = self._build_char_map(
            page_font.getGlyphOrder(), page_font['glyf'], base_font['glyf'])
        # One translation table replaces the chain of per-character replaces.
        decode_table = str.maketrans(char_map)

        soup = bs(page_text, "html.parser")
        show_lists = soup.find_all('div', {'class': 'show-list'})

        print ('---------------Prices-----------------')
        titles = soup.find_all('h3', {'class': 'movie-name'})
        halls = soup.find_all('span', {'class': 'hall'})
        begin_times = soup.find_all('span', {'class': 'begin-time'})
        prices = soup.find_all('span', {'class': 'stonefont'})

        # The rows do not depend on the movie, so decode them once.
        rows = [
            (hall.get_text(), begin.get_text(), price.get_text().translate(decode_table))
            for hall, begin, price in zip(halls, begin_times, prices)
        ]

        # NOTE(review): every showtime found on the page is printed under each
        # movie title. If the spans are nested inside their own 'show-list'
        # div, searching within that div would pair them with the right movie
        # - confirm against the page markup before changing this.
        for _, title in zip(show_lists, titles):
            movie_name = title.get_text()
            for hall_name, begin_time, price_text in rows:
                print(movie_name, hall_name, begin_time, price_text)
# Script entry: build the spider and run one scrape as soon as this file is
# executed. `spider` stays bound as a module-level name afterwards.
spider = MaoyanSpider()
spider.getNote()
当我们破解了自定义字符并替换后,就可以正常存储或输出
解决猫眼自定义字体的问题-获取影院实时电影排片或实时票房等
解决猫眼自定义字体的问题-获取影院实时电影排片或实时票房等