class GetBasicScore:
"""打分系統搭建"""
def __init__(self, sql, root, db, pw, dict_url):
"""這裡注意sql需要放在[sql]中,讀取多個sql, dict_url是字典連結"""
self.sql = sql
self.root = root
self.db = db
self.pw = pw
self.dict_url = dict_url
def sql_to_df(self, one_sql):
"""讀取一個sql"""
basic_url = 'mysql+mysqldb://{root}:{pw}@localhost:3306/{db}?charset=utf8'.format(
root=self.root, pw=self.pw, db=self.db)
engine = create_engine(basic_url)
conn = engine.connect()
df = pd.read_sql(one_sql, conn)
conn.close()
return df
def read_all_sqls(self):
"""讀取多個sql,一個一個拼在後面"""
# 取出第一條sql,讀取進入df1
sql1 = self.sql[0]
print("Loading%s."%sql1)
df1 = self.sql_to_df(sql1)
# 如果存在多條sql,那麼将剩下的讀取并入df1.
if len(self.sql) > 1:
other_sql = self.sql[1:]
for one_sql in other_sql:
print("Loading%s."%one_sql)
df2 = self.sql_to_df(one_sql)
df1 = pd.concat([df1, df2])
del df2
return df1
def get_key_dict(self):
"""從excel中讀取字典,存在字典中,傳回字典+key"""
if 'xls' in self.dict_url:
words = pd.read_excel(self.dict_url)
if 'csv' in self.dict_url:
words = pd.read_csv(self.dict_url)
word_dict = dict()
keys = words.columns
for key in keys:
temp_lists = list(words[key].dropna())
word_dict[key] = temp_lists
return word_dict, keys
def pre_process_df(self, df, new_cols, date_col):
"""df清洗:①通過new_cols,保留關鍵的列;②通過date_col生成新的日期标簽;date_col一般是'date'。③為了防止跨年的周度錯位問題,new_year用來和week合并。"""
df1 = df.reindex(columns = new_cols).dropna(how = 'any')
df1['date'] = pd.to_datetime(df1[date_col])
df1['year'] = df1['date'].apply(lambda x:int(x.year))
df1['month'] = df1['date'].apply(lambda x:int(x.month))
df1['week'] = df1['date'].apply(lambda x:int(x.week))
df1['new_year'] = np.where((df1['week'] >= 50) & (df1['month'] == 1), df1['year'] - 1,
np.where((df1['week'] == 1) & (df1['month'] == 12), df1['year'] + 1,
df1['year']))
return df1
def get_date_tick(self, df, year, tick1):
"""生成新的日期标簽:一般是年(year)+月(month),或者是年(new_year)+周(week)"""
df[year + '-' + tick1] = df.apply(lambda x: str(x[year]) + '-' + str(x[tick1]).zfill(2), axis = 1)
return df
def get_article_score(self, df, word_dict, keys, col_name):
"""df中存着日期+文本;word_dict是存着各個大類的關鍵字典;keys即為大類;col_name一般是df中存放文本的列名,一般是col_name='content';①循環每個大類key,在該大類中,得到每一篇文章的得分②結果放入df中"""
for key in keys:
dicts = word_dict[key]
score_list = []
print(key)
for content in df[col_name]:
score = 0
for k in dicts:
if k in content:
score += content.count(k)
score_list.append(score)
df[key + '_score'] = score_list
return df
def get_period_score(self, df, date_tick, col_name):
"""通過新的date_tick,統計每個period的得分;col_name是df中的存放文本的列名"""
result = df[col_name].groupby(df[date_tick]).count().reset_index()
result.columns = [date_tick, 'article_num']
cols = list(df.columns)
for col in cols:
if 'score' in col:
temp1 = df[col].groupby(df[date_tick]).sum().reset_index()
temp1.columns = [date_tick, col + '_sum']
temp2 = df[col].groupby(df[date_tick]).mean().reset_index()
temp2.columns = [date_tick, col + '_mean']
result = result.merge(temp1, on = [date_tick]).merge(temp2, on = [date_tick])
return result