問題描述
提取Word文檔中的縮略語,並將文檔內的縮略語替換為全稱

解決方案
首先觀察資料,縮小問題範圍
發現文檔內縮略語基本都以表格方式呈現,一般是兩列,有時也有四列,因此只考慮解決縮略語為表格且為2、4列的情況,其他特殊情況(段落等非表格形式,列數不為2、4)不考慮
縮略語判斷邏輯主要就是單元格内文本是大寫字元串的比例
代碼裡我寫了兩種邏輯:V1是判斷大寫單元格占全部非空單元格的比例,V2是判斷奇數列大寫單元格數與偶數列非大寫單元格數的比值
判斷並提取縮略語
def _is_abbreviation_table(table, width, judgement):
    """Heuristically decide whether *table* looks like an abbreviation table.

    'V1' checks the share of non-empty cells that are all upper case.
    'V2' compares the number of upper-case cells in the abbreviation
    columns (index 0, 2) with the number of non-upper-case cells in the
    full-name columns (index 1, 3). Any other *judgement* returns False.
    """
    if judgement == 'V1':
        # Roughly half of the cells (the abbreviation column) should be upper case.
        texts = [cell.text for row in table.rows for cell in row.cells if cell.text]
        upper_count = sum(1 for text in texts if text.isupper())
        # The +1 on both sides keeps the ratio defined for an empty table.
        return 0.4 < (upper_count + 1) / (len(texts) + 1) < 0.6
    if judgement == 'V2':
        # Upper-case abbreviations and non-upper-case full names should
        # appear in about equal numbers.
        short_texts = [cell.text
                       for col in range(0, width, 2)
                       for cell in table.columns[col].cells if cell.text]
        long_texts = [cell.text
                      for col in range(1, width, 2)
                      for cell in table.columns[col].cells if cell.text]
        upper_short = sum(1 for text in short_texts if text.isupper())
        non_upper_long = sum(1 for text in long_texts if not text.isupper())
        return 0.8 < (upper_short + 1) / (non_upper_long + 1) < 1.2
    return False


def _collect_pairs(table, width):
    """Return ``(pairs, initial_matches)`` for one table.

    *pairs* holds ``[abbreviation, full_name]`` for every cell pair whose
    first cell is upper case and longer than one character and whose second
    cell is longer than two characters. *initial_matches* counts the pairs
    whose full-name word initials spell the abbreviation (case-insensitive).
    """
    pairs, initial_matches = [], 0
    for row in table.rows:
        cells = row.cells  # read once per row instead of once per cell access
        for col in range(0, width, 2):
            short, full = cells[col].text, cells[col + 1].text
            if short.isupper() and len(short) > 1 and len(full) > 2:
                pairs.append([short, full])
                initials = ''.join(word[0] for word in full.split(' ') if word)
                if initials.lower() == short.lower():
                    initial_matches += 1
    return pairs, initial_matches


def extract_abbreviation(file, cfd=0.1, judgement='V1'):
    """Extract ``[abbreviation, full_name]`` pairs from the tables of *file*.

    Only tables with 2 or 4 columns are considered. A table is first
    screened with the chosen *judgement* heuristic; its pairs are kept when
    the fraction of pairs whose full-name initials spell the abbreviation
    is greater than *cfd*.

    Args:
        file: Object exposing ``tables``; each table needs ``rows`` and
            ``columns`` whose items expose ``cells`` with a ``text``
            attribute (presumably a python-docx ``Document`` - confirm
            against the caller).
        cfd: Confidence threshold, compared with ``>``.
        judgement: ``'V1'`` or ``'V2'``. Any other value is logged as an
            error and no table is accepted.

    Returns:
        A list of ``[abbreviation, full_name]`` lists from all accepted
        tables, in document order.

    A failure while processing one table is logged and printed, and that
    table is skipped; it never aborts the whole extraction.
    """
    start_time = time.time()
    all_abrvt = []
    if judgement not in ('V1', 'V2'):
        # Previously an unknown value left the screening flag unassigned.
        logging.error('Error: unknown judgement %r, expected "V1" or "V2"', judgement)
    for table in file.tables:
        try:
            width, length = len(table.columns), len(table.rows)
            if width not in (2, 4):
                continue
            if not _is_abbreviation_table(table, width, judgement):
                continue
            # Each row holds width // 2 (abbreviation, full name) pairs.
            pair_count = length * (width // 2)
            if pair_count == 0:
                continue  # empty table: nothing to extract, avoid dividing by zero
            abrvt, cfd_num = _collect_pairs(table, width)
            confidence = cfd_num / pair_count
            if confidence > cfd:
                logging.info('***** 發現縮略語表,%s, confidence: %s' % (abrvt, confidence))
                all_abrvt.extend(abrvt)
        except Exception as e:
            logging.error('Error: ' + str(e), exc_info=True)
            print(e)
    print(all_abrvt)
    print('time: ', time.time()-start_time)
    return all_abrvt
提取出來的縮略語示例:
文檔内縮略語的替換
這裡函數的輸入是文本段落內容和抓取到的縮略語清單。
def abrvt_replace(file_text_total, abrvt):
    """Expand abbreviations inside each paragraph to ``full_name(ABBR)``.

    Args:
        file_text_total: List of paragraph strings; it is copied, not
            modified in place.
        abrvt: Iterable of ``[abbreviation, full_name]`` entries.

    Returns:
        A new list in which, for each paragraph and each abbreviation, the
        first matching token is replaced by ``full_name(ABBR)``. A paragraph
        is left alone for a given abbreviation when it equals the
        abbreviation, does not contain it as a token, or already contains
        the full name (case-insensitive). If anything raises, the error is
        logged and the original ``file_text_total`` is returned.

    Note: a modified paragraph is rebuilt by joining its tokens with single
    spaces, so its original spacing is not preserved. ``word_tokenize`` is
    defined outside this block (presumably NLTK's - confirm).
    """
    try:
        paragraphs = file_text_total.copy()
        for idx, paragraph in enumerate(paragraphs):
            tokens = word_tokenize(paragraph)
            for entry in abrvt:
                short = entry[0]
                # Skip paragraphs that consist solely of the abbreviation.
                if not (paragraph != short):
                    continue
                if short not in tokens:
                    continue
                # The full name is already spelled out in this paragraph.
                if entry[1].lower() in paragraph.lower():
                    continue
                # Only the first occurrence is expanded.
                first_hit = tokens.index(short)
                tokens[first_hit] = entry[1] + "(" + short + ")"
                paragraph = ' '.join(tokens)
                paragraphs[idx] = paragraph
        return paragraphs
    except Exception as e:
        logging.info("***** 縮略語替換出錯")
        logging.error('Error: ' + str(e), exc_info=True)
        return file_text_total
以上抓取和替換都不是很嚴謹,總能構造出反例,但實際效果已足夠解決90%以上的問題了