Python 正则表达式相关内容(代码示例看了就会)

Python 正则表达式相关内容——代码示例看了就会

import re
from tools.infer.predict_rec import application
import os
import json


def _char_table(replacements, delete=""):
    """Build a ``str.translate`` table.

    ``replacements`` maps a string of source characters to the text each of
    them becomes; ``delete`` lists characters to drop. A character present in
    both is replaced, not deleted.
    """
    table = {ord(ch): None for ch in delete}
    for sources, target in replacements.items():
        for ch in sources:
            table[ord(ch)] = target
    return table


def _keep_digits(text):
    """Return only the ASCII digits 0-9 of ``text``, in order."""
    return ''.join(ch for ch in text if ch in "0123456789")


# Characters get_date throws away once look-alike letters have been mapped.
_DATE_JUNK = "QWERTYUIPASDFGHJKLZXCVBNMqwertyuipasdfghjklzxcvbnm!()-_+=[]{};:\'|\\<>?/~`《》.."
_DATE_TABLE = _char_table(
    {"B": "8", "b": "6", "Ss": "5", "Iil": "1", "Zz": "2", "Oo": "0"},
    delete=" " + _DATE_JUNK,
)
# NOTE(review): amounts map "B" to "0" while every other field maps it to
# "8"; kept as found, but this looks like a typo - confirm with the author.
_MONEY_TABLE = _char_table(
    {"。,-": ".", "B": "0", "b": "6", "Ss": "5", "Iil": "1", "Zz": "2",
     "OoCcn": "0"},
    delete=" ",
)
_TAX_ID_TABLE = _char_table({"iIl": "1", "sS": "5", "Oo": "0"}, delete=" Vv")
_SERIAL_TABLE = _char_table(
    {"B": "8", "b": "6", "iIl": "1", "sS": "5", "Oo": "0", "Zz": "2"}
)
_ACCOUNT_TABLE = _char_table(
    {"Ss": "5", "Zz": "2", "Iil": "1", "Oo": "0", "B": "8", "q": "9"}
)
_INFO_TABLE = _char_table({}, delete="~`!@$%^&*+=\"|{}【】[],。《》<>/\\‘;,:")

_DATE_EXACT = re.compile(r"\d{4}年\d{2}月\d{2}日$")
_DATE_SEPARATED = re.compile(r"\d{4}.\d{2}.\d{2}.{1}$")
_DATE_COMPACT = re.compile(
    r"(?<!\d)(\d{4})(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])(?!\d)"
)
_DATE_PARTS = re.compile(r"(\d{4}).(\d{1,2}).(\d{1,2}).*$")
_MONEY_TAIL = re.compile(r"(\d*\.\d{0,2})$")
_MONEY_FINAL = re.compile(r"(\d*\.{0,1}\d{0,2})$")
_FROM_FIRST_DIGIT = re.compile(r"\d+.*$")
_BANK_NAME = re.compile(r"\D+.\D")

_TAX_ID_CHARS = "QWERTYUIOPASDFGHJKLZXCVBNM0123456789"

_CN_DIGITS = {'零': 0, '壹': 1, '贰': 2, '叁': 3, '肆': 4,
              '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9}
_CN_UNITS = {'分': 0.01, '角': 0.1, '元': 1, '拾': 10,
             '佰': 100, '仟': 1000, '圆': 1}


def _section_value(section, add_trailing_digit):
    """Sum ``digit * unit`` for every unit in one section of an amount.

    Only the first occurrence of each unit is considered. A unit with no
    digit directly in front of it (including a unit at index 0) contributes
    nothing. With ``add_trailing_digit`` a final bare digit is added as ones.
    """
    total = 0
    for unit, weight in _CN_UNITS.items():
        pos = section.find(unit)
        if pos > 0 and section[pos - 1] in _CN_DIGITS:
            total += _CN_DIGITS[section[pos - 1]] * weight
    if add_trailing_digit and section and section[-1] in _CN_DIGITS:
        total += _CN_DIGITS[section[-1]]
    return total


class Invoice(object):
    """Clean up text fields read from an invoice.

    Every method takes the raw string of one field, replaces letters that
    resemble digits, strips noise characters and returns the normalised text.
    """

    def __init__(self):
        # Trace output only; "chuangjian" is presumably pinyin for "created".
        print("chuangjian")

    def get_date(self, data_str):
        """Normalise an invoice date to ``YYYY年MM月DD日`` where possible.

        Look-alike letters become digits and ASCII letters/punctuation are
        removed. The cleaned text is then tried, in order, against:

        1. an exact ``dddd年dd月dd日`` ending, returned as matched;
        2. ``dddd?dd?dd?`` with any single separators, returned as matched;
        3. a plausible 8-digit ``YYYYMMDD`` run, reformatted;
        4. year / 1-2 digit month / 1-2 digit day with single separators,
           zero-padded and reformatted.

        If nothing matches the cleaned text is returned unchanged.
        """
        cleaned = data_str.strip().translate(_DATE_TABLE)

        exact = _DATE_EXACT.search(cleaned)
        if exact:
            return exact.group(0)

        separated = _DATE_SEPARATED.search(cleaned)
        if separated:
            return separated.group(0)

        # Separators such as "-", "/" and "." were deleted above, so
        # "2020-10-16" arrives here as "20201016". Without this step the
        # pattern below lets "." swallow digits and yields "2020年00月06日".
        compact = _DATE_COMPACT.search(cleaned)
        if compact:
            return "{}年{}月{}日".format(*compact.groups())

        parts = _DATE_PARTS.search(cleaned)
        if parts:
            year, month, day = parts.groups()
            return "{}年{}月{}日".format(year, month.zfill(2), day.zfill(2))

        return cleaned

    def get_money(self, money_str):
        """Normalise a numeric amount to ``¥<integer>.<two decimals>``.

        Separator variants (``。`` ``,`` ``-``) become ``.``, look-alike
        letters become digits, and everything except digits and dots is
        dropped. Only the trailing ``digits[.dd]`` run of the result is kept,
        so an empty or unreadable input yields ``¥0.00``.
        """
        cleaned = money_str.strip().translate(_MONEY_TABLE)

        # Prefer a trailing "digits.dd" run before stripping other noise.
        tail = _MONEY_TAIL.search(cleaned)
        if tail:
            cleaned = tail.group(1)
        cleaned = ''.join(ch for ch in cleaned if ch in "0123456789.")

        # This pattern can match the empty string, so search never fails.
        amount = _MONEY_FINAL.search(cleaned).group(1)
        integer, _, fraction = amount.partition(".")
        return "¥" + (integer or "0") + "." + fraction.ljust(2, "0")

    def get_mf_code(self, pep_num_str):
        """Normalise a taxpayer identification number.

        Spaces and ``V``/``v`` are removed and look-alike letters become
        digits. The text from the first digit onwards is upper-cased and
        reduced to ASCII letters and digits. Returns ``''`` when the input
        contains no digit.
        """
        cleaned = pep_num_str.strip().translate(_TAX_ID_TABLE)
        found = _FROM_FIRST_DIGIT.search(cleaned)
        if not found:
            return ''
        return ''.join(
            ch for ch in found.group(0).upper() if ch in _TAX_ID_CHARS
        )

    def get_add_phone(self, imf_str):
        """Unfinished placeholder for address/phone matching.

        The original only stripped the argument and compiled an empty
        pattern; it returns ``None``.
        """
        imf_str.strip()  # retained so non-string input still fails as before
        return None

    def get_num2money(self, change_number):
        """Convert a number to a Chinese uppercase currency string.

        ``change_number`` may be a float, an int, or a string (parsed as
        float when it contains ``.``, otherwise as int). Two decimals are
        rendered as 角/分; ``整`` is appended when the result does not end on
        角 or 分.

        Raises:
            ValueError: the string cannot be parsed as a number.
            TypeError: the argument is not a str, float or int.
        """
        format_word = ["分", "角", "圆",
                       "拾", "佰", "仟", "万",
                       "拾", "佰", "仟", "亿",
                       "拾", "佰", "仟", "万",
                       "拾", "佰", "仟", "兆"]
        format_num = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]

        if isinstance(change_number, str):
            text = change_number
            try:
                change_number = float(text) if '.' in text else int(text)
            except ValueError:
                # The original evaluated this message and discarded it, then
                # crashed further down with an UnboundLocalError.
                raise ValueError("%s can't change" % text)

        if isinstance(change_number, float):
            top = len(format_word) - 3
        elif isinstance(change_number, int):
            top = len(format_word)
        else:
            raise TypeError("%s   can't change" % (change_number,))

        # One digit per decimal position, most significant first; the ones,
        # tenths and hundredths positions (i < 1) are always present.
        real_numbers = [
            int(round(change_number / (10 ** i), 2) % 10)
            for i in range(top, -3, -1)
            if change_number >= 10 ** i or i < 1
        ]

        zflag = 0  # run of zeros seen since the last emitted digit/unit
        start = len(real_numbers) - 3
        change_words = []
        for i in range(start, -3, -1):  # i is the power of ten; negative = 角/分
            digit = real_numbers[start - i]
            if 0 < digit or len(change_words) == 0:
                if zflag:
                    change_words.append(format_num[0])
                    zflag = 0
                change_words.append(format_num[digit])
                change_words.append(format_word[i + 2])
            elif 0 == i or (0 == i % 4 and zflag < 3):
                # Emit 圆 always, and 万/亿 unless the whole group was zero.
                change_words.append(format_word[i + 2])
                zflag = 0
            else:
                zflag += 1

        if change_words[-1] not in (format_word[0], format_word[1]):
            change_words.append("整")
        return ''.join(change_words)

    def get_money2num(self, amount):
        """Convert a Chinese uppercase amount to a number.

        The text before the last 亿 is valued and multiplied by 10**8, the
        text before the last 万 of the remainder by 10**4, and what is left
        is valued as is. A unit without a digit directly in front of it is
        ignored. The result is an int unless 角/分 contribute a fraction.
        """
        total = 0
        for marker, scale in (('亿', 100000000), ('万', 10000)):
            head, found, tail = amount.rpartition(marker)
            if found:
                total += _section_value(head, True) * scale
                amount = tail
        return total + _section_value(amount, False)

    def get_fp_number(self, fp_number):
        """Normalise an invoice number to ASCII digits only.

        Look-alike letters are mapped to digits first; every remaining
        non-digit character is dropped.
        """
        return _keep_digits(fp_number.strip().translate(_SERIAL_TABLE))

    def get_fp_code(self, fp_code):
        """Normalise an invoice code; same rules as ``get_fp_number``."""
        return _keep_digits(fp_code.strip().translate(_SERIAL_TABLE))

    def get_mf_name(self, mf_name):
        """Return the buyer/seller name stripped and without spaces."""
        return mf_name.strip().replace(" ", '')

    def get_mf_account(self, mf_account):
        """Split a "bank name + account number" field and clean both parts.

        Returns ``"<bank name> <account digits>"``. The bank name is the
        first non-digit run with characters below code point 127 removed; the
        account number is the text from the first digit onwards with
        look-alike letters mapped and non-digits dropped. A missing part is
        an empty string, and the separating space is always present.
        """
        account = mf_account.strip().replace(" ", '')

        bank_name = ''
        name_match = _BANK_NAME.search(account)
        if name_match:
            bank_name = ''.join(
                ch for ch in name_match.group(0) if ord(ch) >= 127
            )

        number = ''
        number_match = _FROM_FIRST_DIGIT.search(account)
        if number_match:
            number = _keep_digits(
                number_match.group(0).translate(_ACCOUNT_TABLE)
            )

        return bank_name + " " + number

    def get_mf_infor(self, mf_infor):
        """Clean a buyer/seller "address, phone" field.

        Spaces are removed, ``_`` becomes ``-``, each ``--`` pair collapses
        to ``-`` (single pass), and a fixed set of punctuation is deleted.
        """
        infor = mf_infor.strip().replace(" ", '')
        infor = infor.replace("_", '-').replace("--", '-')
        return infor.translate(_INFO_TABLE)


invoice = Invoice()
# Disabled experiment, kept for reference: run get_money over each result
# returned by application().
# inf = application()
# for it in inf:
#     money = invoice.get_money(it[0])
#     print(money)
# Earlier input directory:
# dir_path = r"D:\project\PaddleOCR_test\money"
# Directory the script at the bottom of this file walks for JSON files.
dir_path = r"D:\project\PaddleOCR_test\json_data"
# Wrapper for the numeric (lowercase) amount
def get_money(money_1_pre):
    """Normalise a numeric amount field, passing ``None`` through.

    Returns ``invoice.get_money(money_1_pre)`` for a non-``None`` argument
    and ``None`` otherwise.
    """
    if money_1_pre is None:
        return None
    return invoice.get_money(money_1_pre)


# Wrapper for the invoice date
def get_fp_date(fp_date):
    """Normalise an invoice date field, passing ``None`` through.

    Returns ``invoice.get_date(fp_date)`` for a non-``None`` argument and
    ``None`` otherwise.
    """
    if fp_date is None:
        return None
    return invoice.get_date(fp_date)


# Wrapper for the invoice number
def get_fp_num(fp_num):
    """Normalise an invoice number field, passing ``None`` through.

    Returns ``invoice.get_fp_number(fp_num)`` for a non-``None`` argument and
    ``None`` otherwise.
    """
    if fp_num is None:
        return None
    return invoice.get_fp_number(fp_num)


# Wrapper for the invoice code
def get_fp_code(fp_code):
    """Normalise an invoice code field, passing ``None`` through.

    Returns ``invoice.get_fp_code(fp_code)`` for a non-``None`` argument and
    ``None`` otherwise.
    """
    if fp_code is None:
        return None
    return invoice.get_fp_code(fp_code)


# Wrapper for the buyer/seller name
def get_mf_name(mf_name):
    """Normalise a buyer/seller name field, passing ``None`` through.

    Returns ``invoice.get_mf_name(mf_name)`` for a non-``None`` argument and
    ``None`` otherwise.
    """
    if mf_name is None:
        return None
    return invoice.get_mf_name(mf_name)


# Wrapper for the buyer/seller taxpayer identification number
def get_mf_code(mf_code):
    """Normalise a taxpayer identification number, passing ``None`` through.

    Returns ``invoice.get_mf_code(mf_code)`` for a non-``None`` argument and
    ``None`` otherwise.
    """
    if mf_code is None:
        return None
    return invoice.get_mf_code(mf_code)


# Wrapper for the buyer/seller bank name and account number
def get_mf_account(mf_account):
    """Normalise a bank-and-account field, passing ``None`` through.

    Returns ``invoice.get_mf_account(mf_account)`` for a non-``None``
    argument and ``None`` otherwise.
    """
    if mf_account is None:
        return None
    return invoice.get_mf_account(mf_account)


# Wrapper for the buyer/seller address and phone
def get_mf_infor(mf_infor):
    """Normalise an address-and-phone field, passing ``None`` through.

    Returns ``invoice.get_mf_infor(mf_infor)`` for a non-``None`` argument
    and ``None`` otherwise.
    """
    if mf_infor is None:
        return None
    return invoice.get_mf_infor(mf_infor)


# Uppercase amount (takes the uppercase text and the numeric amount)
def get_bigmoney(fp_bigmoney, money_1_pre):
    """Return a usable uppercase (Chinese) amount for an invoice.

    Args:
        fp_bigmoney: the uppercase amount text, or ``None``.
        money_1_pre: the raw numeric amount text, or ``None``.

    Returns:
        * ``fp_bigmoney`` is ``None``: the uppercase form generated from
          ``money_1_pre``, or ``None`` when that is ``None`` too.
        * otherwise the cleaned ``fp_bigmoney`` when all of its characters
          are valid amount characters, or when no numeric amount is given.
        * otherwise the uppercase form generated from ``money_1_pre``, unless
          the numeric amount normalises to ``0.00``; in that case the cleaned
          ``fp_bigmoney`` is returned.
    """
    if fp_bigmoney is None:
        if money_1_pre is None:
            return fp_bigmoney
        money = invoice.get_money(money_1_pre).replace("¥", '')
        return invoice.get_num2money(money)

    # Drop every character below code point 137, which covers spaces, digits
    # and all ASCII. The 137 threshold is kept from the original.
    fp_bigmoney = ''.join(ch for ch in fp_bigmoney.strip() if ord(ch) >= 137)

    # 佰, 仟 and 元 were missing although the converters in this file use
    # them, so every amount containing them was treated as corrupt. 百 and 阡
    # are retained so previously accepted text is still accepted.
    valid_chars = ("零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖",
                   "分", "角", "圆", "元", "拾", "百", "佰", "阡", "仟",
                   "万", "亿", "兆", "整")

    has_invalid_char = any(ch not in valid_chars for ch in fp_bigmoney)
    if money_1_pre is not None and has_invalid_char:
        money = invoice.get_money(money_1_pre).replace("¥", '')
        if money == "0.00":
            # A zero numeric amount means that field is unreliable as well.
            return fp_bigmoney
        return invoice.get_num2money(money)
    return fp_bigmoney


def _normalise_party(party):
    """Normalise the four fields of a buyer/seller record (``None`` passes)."""
    if party is None:
        return None
    return {
        '名称': get_mf_name(party["名称"]),
        '纳税人识别号': get_mf_code(party["纳税人识别号"]),
        '地址、电话': get_mf_infor(party["地址、电话"]),
        '开户行及账号': get_mf_account(party['开户行及账号']),
    }


# Normalise every JSON file below dir_path and print the result.
for parent, dirname, filenames in os.walk(dir_path):
    for filename in filenames:
        # Join with the directory os.walk is visiting, not with dir_path,
        # so files inside subdirectories resolve to the right path.
        file_path = os.path.join(parent, filename)
        print(file_path)
        with open(file_path, encoding="utf-8") as f:
            json_f = json.load(f)

        infor_dit = {}
        infor_dit['购买方'] = _normalise_party(json_f["购买方"])
        infor_dit['销售方'] = _normalise_party(json_f["销售方"])
        infor_dit['开票日期'] = get_fp_date(json_f["开票日期"])
        infor_dit['发票代码'] = get_fp_code(json_f["发票代码"])
        infor_dit['发票号码'] = get_fp_num(json_f["发票号码"])

        money_1_pre = json_f['小写金额']
        infor_dit['小写金额'] = get_money(money_1_pre)
        infor_dit['大写金额'] = get_bigmoney(json_f["大写金额"], money_1_pre)
        infor_dit['税前金额'] = get_money(json_f['税前金额'])

        print("*" * 20)
        print(infor_dit)


本文来自互联网用户投稿,文章观点仅代表作者本人,不代表本站立场,不承担相关法律责任。如若转载,请注明出处。 如若内容造成侵权/违法违规/事实不符,请点击【内容举报】进行投诉反馈!

相关文章

立即
投稿

微信公众账号

微信扫一扫加关注

返回
顶部