展開全部
收集了所有的英文標點跟常用的中文標點來做判斷。目前程式輸入的 a.txt 需要是 utf8 編碼的，如果你用的是其他編碼格式，把最後一行的 utf8 改成你自己用的編碼格式應該就可以了。
#! coding: utf8
from __future__ import unicode_literals
import re
# CJK punctuation that does not terminate a sentence.  The adjacent string
# literals below are concatenated by the parser into one string; each group
# is labelled with the kind of character it holds.
non_stops = (
    # Fullwidth ASCII variants
    '\uFF02\uFF03\uFF04\uFF05\uFF06\uFF07\uFF08\uFF09\uFF0A\uFF0B\uFF0C\uFF0D'
    '\uFF0F\uFF1A\uFF1B\uFF1C\uFF1D\uFF1E\uFF20\uFF3B\uFF3C\uFF3D\uFF3E\uFF3F'
    '\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF60'
    # Halfwidth CJK punctuation
    '\uFF62\uFF63\uFF64'
    # Ideographic space, ideographic comma, ditto mark
    '\u3000\u3001\u3003'
    # CJK angle, corner and lenticular brackets.  U+3008, U+3009 and U+300A
    # (the angle brackets and the opening double angle bracket) were missing
    # although the closing U+300B was listed, so a line holding only the
    # opening bracket slipped through the filter.
    '\u3008\u3009\u300A\u300B\u300C\u300D\u300E\u300F\u3010\u3011'
    # Other CJK brackets, wave dash and prime quotation marks
    '\u3014\u3015\u3016\u3017\u3018\u3019\u301A\u301B\u301C\u301D\u301E\u301F'
    # Wavy dash
    '\u3030'
    # Ideographic variation indicator and half fill space
    '\u303E\u303F'
    # En dash and em dash
    '\u2013\u2014'
    # Quotation marks and apostrophes
    '\u2018\u2019\u201B\u201C\u201D\u201E\u201F'
    # Horizontal ellipsis and hyphenation point
    '\u2026\u2027'
    # Wavy low line
    '\uFE4F'
)

# CJK punctuation that ends a sentence.
stops = (
    '\uFF01'  # fullwidth exclamation mark
    '\uFF1F'  # fullwidth question mark
    '\uFF61'  # halfwidth ideographic full stop
    '\u3002'  # ideographic full stop
)

punctuation = non_stops + stops
# ASCII punctuation, written as four ranges: '!'..'/', ':'..'@', '['..'`'
# and '{'..'~'.  The '-' characters are only meaningful as range operators
# once this string is placed inside the regex character class below.
punctuation += '\u0021-\u002f\u003a-\u0040\u005b-\u0060\u007b-\u007e'

# Matches any single punctuation character listed above.
r = re.compile('[{}]'.format(punctuation))
# Copy a.txt to b.txt, dropping every line that contains at least one
# punctuation character.  Both files are opened in binary mode: each line is
# decoded only for the regex test and the original bytes are written back
# unchanged.  The with-block closes (and flushes) both files even if decoding
# raises part-way through.
with open('a.txt', 'rb') as fin, open('b.txt', 'wb') as fout:
    fout.writelines(e for e in fin if not r.search(e.decode('utf8')))