天天看點

爬蟲——時間格式轉換模組

實際的爬蟲項目中,往往要爬取上萬個網站,如何在抓取資料的過程中統一這些網站的資料格式是一個很大的問題。這些網站的時間格式千奇百怪,各種語言都有,所以為了方便處理大多數網站,寫了這個日期格式轉換的腳本。

這個腳本並不能百分之百適用於所有網站,但足以應付其中的大多數。

不足之處,請根據項目的需要自行修改。

全部代碼

import datetime
import re
import time
import logging
from dateutil.parser import parse
from dateutil.relativedelta import relativedelta
from datetime import timezone

# Relative-day words mapped to the number of days before today they denote.
# Traditional and Simplified spellings are both listed where they differ.
date_dict = {'剛剛': 0, '刚刚': 0, '今天': 0, '今日': 0, '昨天': 1, '昨日': 1, '前天': 2, '前日': 2}
# Chinese date/time markers (and '.') mapped to the separator that replaces
# them before the string is handed to the date parser. Insertion order is the
# order in which the replacements are applied.
replace_dict = {'年': '-', '月': '-', '日': ' ', '時': ':', '时': ':', '點': ':', '点': ':', '分': ':', '秒': ' ',
                '.': '-', '上午': ' ', '下午': ' '}
# Unit words that may appear in "<number><unit>前" ("<number> <unit> ago"),
# mapped to the length of one such unit. Calendar units use relativedelta so
# that months and years are subtracted by calendar rather than by a fixed
# number of days. Traditional and Simplified spellings are both listed.
date_before_dict = {
    '年': relativedelta(years=1),
    '月': relativedelta(months=1),
    '個月': relativedelta(months=1),
    '个月': relativedelta(months=1),
    '周': datetime.timedelta(days=7),
    '週': datetime.timedelta(days=7),
    '星期': datetime.timedelta(days=7),
    '天': datetime.timedelta(days=1),
    '日': datetime.timedelta(days=1),
    '時': datetime.timedelta(hours=1),
    '时': datetime.timedelta(hours=1),
    '小時': datetime.timedelta(hours=1),
    '小时': datetime.timedelta(hours=1),
    '分': datetime.timedelta(minutes=1),
    '分鐘': datetime.timedelta(minutes=1),
    '分鍾': datetime.timedelta(minutes=1),
    '分钟': datetime.timedelta(minutes=1),
    '秒': datetime.timedelta(seconds=1),
    '秒鐘': datetime.timedelta(seconds=1),
    '秒鍾': datetime.timedelta(seconds=1),
    '秒钟': datetime.timedelta(seconds=1),
}



def dealstring(date_time):
    """Normalise a raw scraped date string before it is parsed.

    The string is stripped and upper-cased, then the following clean-ups are
    applied in order:

    1. English "<n> <unit> ago" phrases are rewritten to the Chinese
       "<n><unit>前" form (e.g. ``"3 hours ago"`` -> ``"3小時前"``); a leading
       "a"/"an" before a unit is treated as 1.
    2. Leading labels ``ATNAUJINTA`` (Lithuanian for "updated"),
       ``PUBLISHED:`` and ``UPDATED:`` are dropped. For the latter two every
       ``.`` in the remainder is removed as well.
    3. A trailing fractional-seconds part followed by ``Z`` or ``UTC`` is
       removed (``"...:30.000Z"`` -> ``"...:30"``).
    4. Anything from the first ``(`` onwards is dropped.
    5. If the first run of CJK characters is a Chinese month name it is
       replaced by its two-digit month number.

    :param date_time: raw date string
    :return: the normalised (upper-cased) string
    """
    date_time = date_time.strip().upper()

    if "AGO" in date_time:
        # "A DAY AGO" / "AN HOUR AGO" -> "1 DAY AGO" / "1 HOUR AGO"
        date_time = re.sub(
            r"^AN?\s+(?=(?:SECOND|MINUTE|HOUR|DAY|WEEK|MONTH|YEAR))", "1 ", date_time
        )
        # Plural forms come first so that no stray "S" is left behind.
        unit_words = (
            ("SECONDS", "秒"), ("SECOND", "秒"),
            ("MINUTES", "分"), ("MINUTE", "分"),
            ("HOURS", "小時"), ("HOUR", "小時"),
            ("DAYS", "天"), ("DAY", "天"),
            ("WEEKS", "周"), ("WEEK", "周"),
            ("MONTHS", "月"), ("MONTH", "月"),
            ("YEARS", "年"), ("YEAR", "年"),
        )
        for english, chinese in unit_words:
            date_time = date_time.replace(english, chinese)
        date_time = date_time.replace("AGO", "前").replace(" ", "")

    # Keep only what follows the label; a separating colon is dropped too.
    if "ATNAUJINTA" in date_time:
        date_time = date_time.split("ATNAUJINTA")[1].lstrip(":").strip()
    for label in ("PUBLISHED:", "UPDATED:"):
        if label in date_time:
            date_time = date_time.split(label)[1].strip().replace(".", "")

    # Remove a trailing ".<fraction>Z" / ".<fraction> UTC" as a suffix.
    # (str.strip() must not be used here: it strips a *set of characters*
    # from both ends and would eat real digits of the timestamp.)
    date_time = re.sub(r"\.\d+\s*(?:Z|UTC)$", "", date_time)

    # Drop a trailing parenthetical such as "(WED)".
    date_time = date_time.split("(")[0]

    month_numbers = {
        "一月": "01", "二月": "02", "三月": "03", "四月": "04", "五月": "05", "六月": "06", "七月": "07",
        "八月": "08", "九月": "09", "十月": "10", "十一": "11", "十二": "12", "十一月": "11", "十二月": "12"
    }
    # Only the first run of CJK characters is considered, and it has to be
    # exactly a month name for the replacement to happen.
    cjk_runs = re.findall(r'[\u4e00-\u9fa5]+', date_time)
    if cjk_runs and cjk_runs[0] in month_numbers:
        date_time = date_time.replace(cjk_runs[0], month_numbers[cjk_runs[0]])
    return date_time



def getDateTime(date_time):
    '''
    Convert a scraped date value into a ``%Y-%m-%d %H:%M:%S`` string.

    After normalisation by ``dealstring`` the input is tried, in order, as:
    an all-digit epoch timestamp (10 digits = seconds, 13 digits =
    milliseconds; any other all-digit string of 10+ characters is rejected),
    a relative "...前" ("... ago") expression, a relative day word from
    ``date_dict`` (optionally followed by a time), and finally a general
    date string handled by ``datetimeConversion``.

    :param date_time: the date to convert; non-str values are passed through str()
    :return: str in the format %Y-%m-%d %H:%M:%S (local time), or '' when the
             input is empty or cannot be parsed (the failure is logged at INFO)
    '''
    fmt = "%Y-%m-%d %H:%M:%S"
    if not isinstance(date_time, str):
        date_time = str(date_time)
    date_time = dealstring(date_time)
    try:
        if date_time.isdigit() and len(date_time) >= 10:
            if len(date_time) == 10:
                epoch = int(date_time)
            elif len(date_time) == 13:
                epoch = int(date_time) / 1000
            else:
                return ''
            return time.strftime(fmt, time.localtime(epoch))

        if date_time == '':
            return ''

        # A string ending in an hour marker needs a minute value after it,
        # because the marker is later turned into ':'.
        if date_time.endswith(('點', '時')):
            date_time += '0'

        # '下午' (afternoon) means the clock time is on a 12-hour scale.
        ths = '12' if '下午' in date_time else '24'

        # Handle dates shaped like 20/0903.
        if date_time.count('/') == 1 and len(date_time) == 7:
            date_time = date_time.replace('/', '')

        # "<n><unit>前" / "<n><unit>之前": an amount of time before now.
        if date_time.endswith('前'):
            return DateTimebefore(date_time).strftime(fmt)

        # Relative day words: today / yesterday / the day before yesterday.
        one_day = datetime.timedelta(days=1)
        today = time.localtime()
        for key, value in date_dict.items():
            # The word on its own: the current time shifted back by `value` days.
            if date_time == key:
                nowtime = time.strftime(fmt, today)
                return (datetimeConversion(nowtime, ths) - one_day * value).strftime(fmt)

            # The word followed by a time: substitute the concrete date for
            # the word and parse the result.
            if key in date_time and len(date_time) > len(key):
                nowtime = time.strftime("%Y-%m-%d", today)
                day = (datetimeConversion(nowtime) - one_day * value).strftime("%Y-%m-%d")
                date_time = date_time.replace(key, day + ' ')
                return datetimeConversion(date_time, ths).strftime(fmt)

        return datetimeConversion(date_time, ths).strftime(fmt)
    except Exception as e:
        # Best effort: any parsing failure yields '' instead of propagating.
        logging.info(f'-時間解析出錯--{date_time}-{repr(e)}')
        return ''



# Convert a "<number><unit>前" ("<number> <unit> ago") expression to a datetime
def DateTimebefore(date_time):
    '''
    Resolve a relative "<number><unit>前" / "<number><unit>之前" expression
    against the current local time.

    :param date_time: str such as "3天前"; whitespace between the number and
                      the unit is tolerated
    :return: datetime equal to now minus <number> units
    :raises ValueError: if the expression does not start with a number
    :raises KeyError: if the unit is not a key of date_before_dict
    '''
    relative = date_time.replace('前', '').replace('之', '').strip()
    # Split the leading number from the unit that follows it.
    match = re.match(r'(\d+)\s*(.*)$', relative)
    if match is None:
        raise ValueError(f'no leading number in relative time: {date_time!r}')
    count = int(match.group(1))
    unit = match.group(2).strip()
    return datetime.datetime.today() - date_before_dict[unit] * count


# Parse a date string into a datetime
def datetimeConversion(date_time, ths='24'):
    '''
    Parse a (possibly Chinese-formatted) date string into a naive datetime
    with second resolution.

    Weekday words ("星期X") and runs of whitespace are collapsed to a single
    space, the markers in replace_dict are substituted, and the result is
    parsed with dateutil (year-first). Afterwards:

    * with ths == '12', 12 hours are added when the parsed hour is below 12,
      so 12:xx PM and already-24-hour times are left untouched;
    * a result later than the current time is moved back by one year
      (presumably for inputs without a year, which the parser fills in with
      the current year);
    * a result at exactly 00:00:00 or 12:00:00 whose input did not contain
      ':00' is treated as date-only and receives the current time of day.

    :param date_time: the date as a string
    :param ths: '24' or '12' for a 24-hour or 12-hour clock, default '24'
    :return: naive datetime
    :raises ValueError: if the string cannot be parsed as a date
    '''
    # An explicit ':00' in the input means a zero time component was really
    # written, so it must not be mistaken for "no time given".
    has_explicit_zero = ':00' in date_time
    date_time = re.sub(r'\s+|星期.?', ' ', date_time)

    for key, value in replace_dict.items():
        date_time = date_time.replace(key, value)
    # A trailing ':' is left over when the string ended in a time marker.
    if date_time.endswith(':'):
        date_time = date_time[:-1]

    # Drop any timezone and sub-second part so that the value can be compared
    # with the naive current time.
    parsed = parse(date_time, yearfirst=True).replace(tzinfo=None, microsecond=0)
    now = datetime.datetime.today()

    # 12-hour clock to 24-hour clock
    if ths == '12' and parsed.hour < 12:
        parsed += datetime.timedelta(hours=12)
    # A date in the future is moved back one year.
    if parsed > now:
        parsed -= relativedelta(years=1)
    # Date without a time: use the current time of day.
    if parsed.hour in (0, 12) and parsed.minute == 0 and parsed.second == 0 and not has_explicit_zero:
        parsed = parsed.replace(hour=now.hour, minute=now.minute, second=now.second)
    return parsed


# Convert a date to a Unix timestamp
def timestampConversion(date_time):
    '''
    Convert a date (given as text) to a Unix timestamp.

    The input is first normalised with getDateTime; the resulting
    "%Y-%m-%d %H:%M:%S" string is interpreted as local time.

    :param date_time: the date to convert
    :return: int timestamp in seconds, or '' when getDateTime returns ''
    '''
    formatted = getDateTime(date_time)
    if formatted != '':
        parsed = time.strptime(formatted, "%Y-%m-%d %H:%M:%S")
        return int(time.mktime(parsed))
    return ''


if __name__ == '__main__':
    # Manual smoke run: convert one sample string, then print the elapsed
    # seconds followed by the converted value.
    started = time.time()
    sample = "1 year ago"
    converted = getDateTime(sample)
    elapsed = time.time() - started
    print(elapsed)
    print(converted)