Verwenden Sie reguläre Ausdrücke, um Datumsangaben im Text zu extrahieren

In den Vereinigten Staaten wird der Weihnachtstag 2017 als "12/25/17" (Monat/Tag/Jahr) geschrieben, während derselbe Tag in Europa als "25/12/17" (Tag/Monat/Jahr) geschrieben wird. Der hier erstellte Datumsextraktor versucht, beide Reihenfolgen von Tag und Monat zu erkennen, und überprüft anschließend, ob es sich um ein gültiges Datum handelt.

import re
from datetime import date
import datetime

# Regular expression for US-style dates: month, '-' or '/', day, then an
# optional separator plus four-digit year.
# Capture groups, in order: whole date, month+day, month, day,
# separator+year, first two digits of the year.
us = r'((([01]?\d)[-/]([0123]?\d))([-/]([0123]\d)\d\d)?)'
us_sample = 'Santa came 12/25/2017. An elf appeared 12/12.'
# findall returns one tuple of all six groups per match; groups that did not
# take part in the match come back as empty strings.
mdy = re.compile(us).findall(us_sample)
print(mdy)

# Turn each extracted tuple into a dict of named fields.
# Each tuple in mdy holds, in order: whole date, month+day, month, day,
# separator+year, and the first two digits of the year.
dates = [
    {
        'mdy': x[0],
        'md': x[1],
        'm': int(x[2]),
        'd': int(x[3]),
        # x[4] still carries its leading separator. The pattern accepts both
        # '/' and '-', so strip either one; stripping only '/' would turn
        # '-2017' into the negative year -2017. A missing year becomes 0.
        'y': int(x[4].lstrip('/-') or 0),
        # Leading two digits of the year, or 0 when no year was matched.
        'c': int(x[5] or 0),
    }
    for x in mdy
]
print(dates)

# Basic context handling: any field whose value is falsy (e.g. a year of 0
# because none was written) is copied from the record just before it.
# The first record has no predecessor, so it is compared against itself and
# therefore stays unchanged.
for idx, record in enumerate(dates):
    earlier = dates[idx - 1] if idx else record
    for field in record:
        if record[field]:
            continue
        record[field] = earlier[field]

print(dates)
# Build real date objects from the completed year / month / day fields.
datetimes = []
for record in dates:
    datetimes.append(date(record['y'], record['m'], record['d']))
print(datetimes)


# Regular expression for European-style dates: day, '-' or '/', month, then
# an optional separator plus a two- or four-digit year.
# Capture groups, in order: whole date, day+month, day, month,
# separator+year, first two digits of a four-digit year.
eu = r'((([0123]?\d)[-/]([01]?\d))([-/]([0123]\d)?\d\d)?)'
eu_finder = re.compile(eu)
eu_samples = (
    'Alan Mathison Turing OBE FRS (23/6/1912-7/6/1954) was an English computer scientist.',
    'Alan Mathison Turing OBE FRS (23/6/12-7/6/54) was an English computer scientist.',
)
# Run the same pattern on four-digit and then two-digit years; dmy ends up
# holding the result for the last sample.
for eu_sample in eu_samples:
    dmy = eu_finder.findall(eu_sample)
    print(dmy)

# Combined extraction, step 1: recognising years.
# Two-digit years 30-99 (intended to mean 1930-1999).
yr_19xx = r'\b(?P<yr_19xx>' + '|'.join(map(str, range(30, 100))) + r')\b'
# Two-digit years 00-29, always zero-padded to width 2 (intended to mean
# 2000-2029).
yr_20xx = r'\b(?P<yr_20xx>' + '|'.join('%02d' % n for n in range(30)) + r')\b'
# Leading one or two digits (1-39) of a 3- or 4-digit year, e.g. the "1" in
# "123 A.D." or the "20" in "2018".
yr_cent = r'\b(?P<yr_cent>' + '|'.join(str(n) for n in range(1, 40)) + ')'
# Trailing two digits (00-99) of a 3- or 4-digit year, e.g. the "23" in
# "123 A.D." or the "18" in "2018".
yr_ccxx = '(?P<yr_ccxx>' + '|'.join('%02d' % n for n in range(100)) + r')\b'
# A full 3- or 4-digit year is the leading part followed by the trailing part.
yr_xxxx = r'\b(?P<yr_xxxx>({})({}))\b'.format(yr_cent, yr_ccxx)
# Any of the three year forms, tried in this order.
yr = r'\b(?P<yr>' + '|'.join([yr_19xx, yr_20xx, yr_xxxx]) + r')\b'
# finditer yields match objects one at a time, each knowing its position.
year_sample = "0, 17, 2000, 01, '08, 99, 1984, 2030/1970 85 47 `66"
groups = list(re.finditer(yr, year_sample))
full_years = [match.group('yr') for match in groups]
print(full_years)

# Recognise months with a regular expression.
mon_words = 'January February March April May June July August September October November December'
# Each month contributes five alternatives, in this order: full name, first
# four letters, first three letters, plain number, zero-padded number.
month_alternatives = []
for month_number, month_name in enumerate(mon_words.split(), start=1):
    month_alternatives.extend([
        month_name,
        month_name[:4],
        month_name[:3],
        str(month_number),
        '%02d' % month_number,
    ])
mon = r'\b(?P<mon>' + '|'.join(month_alternatives) + r')\b'
print(re.findall(mon, 'January has 31 days, February the 2nd month of 12, has 28, except in a Leap Year.'))

# Combine the day, month and year patterns into full-date expressions.
# Day of month 1-31, each offered zero-padded first and then unpadded.
day = '(?P<day>' + '|'.join('%02d|%d' % (n, n) for n in range(1, 32)) + r')\b'
# Text that closes one parenthesised part, allows up to two separator
# characters (dash, comma, slash or space), and opens the next part.
part_joiner = r')\b[-,/ ]{0,2}\b('
# European order: day, month, year. Group names get an "eu_" prefix so they
# do not clash with the US variant inside the combined pattern.
eu = r'\b(' + part_joiner.join([
    day.replace('<day', '<eu_day'),
    mon.replace('<mon', '<eu_mon'),
    yr.replace('<yr', '<eu_yr'),
]) + r')\b'
# US order: month, day, year, with a "us_" prefix on the group names.
us = r'\b(' + part_joiner.join([
    mon.replace('<mon', '<us_mon'),
    day.replace('<day', '<us_day'),
    yr.replace('<yr', '<us_yr'),
]) + r')\b'
# The European layout is tried before the US layout.
date_pattern = r'\b(' + '|'.join((eu, us)) + r')\b'
print(list(re.finditer(date_pattern, '31 Oct, 1970 25/12/2017')))

# Validate the extracted dates by building real datetime.date objects.
# datetime.date raises ValueError for impossible calendar dates; that error
# is deliberately left to propagate.
month_names = mon_words.split()
dates = []
es_groups = list(re.finditer(date_pattern, "0, 12/25/2017, 2000, 01, '08, 99, 1984, 31/10/1970, 85 47 `66"))
for g in es_groups:
    # date_pattern is an alternation of the European and US layouts, so each
    # field is read from whichever side produced a value.
    month_num = (g['us_mon'] or g['eu_mon']).strip()
    try:
        month_num = int(month_num)
    except ValueError:
        # The month was written as a name or as a 3-/4-letter abbreviation.
        # Compare it against equally long prefixes of the month NAMES;
        # iterating mon_words itself would walk the string character by
        # character and never find a match.
        month_num = [w[:len(month_num)] for w in month_names].index(month_num) + 1
    year = int(g['us_yr'] or g['eu_yr'])
    # Two-digit years are captured by the yr_19xx (30-99) and yr_20xx (00-29)
    # sub-groups; expand them so "17" becomes 2017 rather than the year 17.
    if g['us_yr_19xx'] or g['eu_yr_19xx']:
        year += 1900
    elif g['us_yr_20xx'] or g['eu_yr_20xx']:
        year += 2000
    # Append directly instead of binding a variable named "date", which would
    # shadow the class imported with "from datetime import date".
    dates.append(datetime.date(year, month_num, int(g['us_day'] or g['eu_day'])))
print("dates:", dates)

Ich denke du magst

Origin blog.csdn.net/fgg1234567890/article/details/113918834
Empfohlen
Rangfolge