#AE结束日期为空,则设置为2100-01-01,日期含有UNK,则已01替换,开始日期年月日军均未知替换为1900-01-01import pandas as pd
import mymod
import excelformat2
import re
from datetime import datetime
from fuzzywuzzy import fuzz
#文件读入,输出路径定义-----------------------------------------------------------------------------------------------------------------------------
date=mymod.datestr()
writer=pd.ExcelWriter(r'C:\Users\XinXinbuX280\Desktop\阿达木三期p\output\阿达木三期异常有临床意义LB匹配MHAE{}.xlsx'.format(date))file=r'C:\Users\XinXinbuX280\Desktop\阿达木三期p\output\阿达木三期异常有临床意义LB{}.xlsx'.format(date)
file_raw=r'C:\Users\XinXinbuX280\Desktop\阿达木三期p\rawdata\WIBP2018004_阿达木III期_数据库冻结后_Datasets_XLS_Site-All_zh-CN_20210918.xlsx'
file_match=r'C:\Users\XinXinbuX280\Desktop\阿达木三期p\rawdata\LB-AE匹配规则.xlsx'#匹配规则处理-------------------------------------------------------------------------------------------------------------------------------------
matchfile=pd.read_excel(r'C:\Users\XinXinbuX280\Desktop\阿达木三期p\rawdata\LB-AE匹配规则.xlsx')
key=list(matchfile['检查项'])
value=list(matchfile['key'])
match=dict(zip(key,value))for i in match.keys():match[i]=match[i].split(',')
ae=pd.read_excel(file_raw,'AE')
lb=pd.read_excel(file)
mh=pd.read_excel(file_raw,'MH')#日期处理---------------------------------------------------------------------------------------------------#AE、MH结束日期为空处理
ae['AEENDAT']=ae['AEENDAT'].fillna('2100-01-01')#AE无结束日期则将结束日期赋值为较大值2100-01-01
mh['MHENDAT']=mh['MHENDAT'].fillna('2100-01-01')#mh无结束日期则将结束日期赋值为较大值2100-01-01#含UNK的日期处理-替换为01for i inrange(1,len(ae)):ae.loc[i,['AESTDAT']]=re.sub('UNK','01',ae['AESTDAT'][i])ae.loc[i,['AEENDAT']]=re.sub('UNK','01',ae['AEENDAT'][i])print(ae['AESTDAT'][i])#AE日期格式化for i inrange(1,len(ae)):ae.loc[i,'AESTDAT']=datetime.strptime(ae['AESTDAT'][i],'%Y-%m-%d')ae.loc[i,'AEENDAT']=datetime.strptime(ae['AEENDAT'][i],'%Y-%m-%d')#MH特殊点:年份未知,则赋值为1900-01-01for i inrange(1,len(mh)):if mh['MHSTDAT'][i][0]=='U':mh.loc[i,['MHSTDAT']]='1900-01-01'if mh['MHENDAT'][i][0]=='U':mh.loc[i,['MHENDAT']]='2100-01-01'if mh['MHSTDAT'][i][0]!='U'and mh['MHENDAT'][i][0]!='U':mh.loc[i,['MHSTDAT']]=re.sub('UNK','01',mh['MHSTDAT'][i])mh.loc[i,['MHENDAT']]=re.sub('UNK','01',mh['MHENDAT'][i])print(mh['MHSTDAT'][i])#MH日期格式化for i inrange(1,len(mh)):mh.loc[i,'MHSTDAT']=datetime.strptime(mh['MHSTDAT'][i],'%Y-%m-%d')mh.loc[i,'MHENDAT']=datetime.strptime(mh['MHENDAT'][i],'%Y-%m-%d')#定义匹配信息列----------------------------------------------------------------
aeterm=[]
aest=[]
aeed=[]
aeno=[]
mhterm=[]
mhst=[]
mhed=[]
mhno=[]
match_result=[]#处理lb检测日期------------------------------------------------------------------for i inrange(len(lb)):lb.loc[i,['检查日期']]=datetime.strptime(lb['检查日期'][i],'%Y-%m-%d')aeterm.append('#匹配失败')aest.append(' ')aeed.append(' ')aeno.append(' ')mhterm.append('#匹配失败')mhst.append(' ')mhed.append(' ')mhno.append(' ')match_result.append('#匹配失败')#按检查项名称匹配常规访视lb---------------------------------------------------------------------------------------------------------------------------------#AEfor i inrange(len(lb)):if lb['表单名称'][i]!='LB_OTH':for m in match[lb['检查项目'][i]]:print(m)for n inrange(1,len(ae)):if lb['筛选号'][i]==ae['SUBJID'][n]and(lb['检查日期'][i]-ae['AESTDAT'][n]).days>=0and(lb['检查日期'][i]-ae['AEENDAT'][n]).days <=0:iflen(re.findall(m,ae['AETERM'][n]))>0:aeterm[i]=ae['AETERM'][n]aest[i]=ae['AESTDAT'][n]aeed[i]=ae['AEENDAT'][n]aeno[i]=ae['AESEQ'][n]# MHfor i inrange(len(lb)):if lb['表单名称'][i]!='LB_OTH':for m in match[lb['检查项目'][i]]:print(m)for n inrange(1,len(mh)):if lb['筛选号'][i]==mh['SUBJID'][n]and(lb['检查日期'][i]-mh['MHSTDAT'][n]).days>=0and(lb['检查日期'][i]-mh['MHENDAT'][n]).days <=0:iflen(re.findall(m,mh['MHTERM'][n]))>0:mhterm[i]=mh['MHTERM'][n]mhst[i]=mh['MHSTDAT'][n]mhed[i]=mh['MHENDAT'][n]mhno[i]=mh['MHSEQ'][n]#按lb备注匹配非计划访视-------------------------------------------------------------------------------------------------------------------------------------#AEfor i inrange(len(lb)):if lb['表单名称'][i]=='LB_OTH':ratiodic={}for n inrange(1,len(ae)):print(lb['检查日期'][i],ae['AESTDAT'][n],ae['AEENDAT'][n])if(lb['检查日期'][i]-ae['AESTDAT'][n]).days>=0and(lb['检查日期'][i]-ae['AEENDAT'][n]).days <=0and lb['筛选号'][i]==ae['SUBJID'][n]:ratio=fuzz.ratio(lb['备注'][i],ae['AETERM'][n])ratiodic[ratio]=niflen(ratiodic)>0andmax(ratiodic.keys())>=50:num=ratiodic[max(ratiodic.keys())]aeterm[i]=ae['AETERM'][num]aest[i]=ae['AESTDAT'][num]aeed[i]=ae['AEENDAT'][num]aeno[i]=ae['AESEQ'][num]#MHfor i inrange(len(lb)):if lb['表单名称'][i]=='LB_OTH':ratiodic={}for n inrange(1,len(mh)):if(lb['检查日期'][i]-mh['MHSTDAT'][n]).days>=0and(lb['检查日期'][i]-mh['MHENDAT'][n]).days <=0and lb['筛选号'][i]==mh['SUBJID'][n]:ratio=fuzz.ratio(lb['备注'][i],mh['MHTERM'][n])ratiodic[ratio]=niflen(ratiodic)>0andmax(ratiodic.keys())>=50:num=ratiodic[max(ratiodic.keys())]mhterm[i]=mh['MHTERM'][num]mhst[i]=mh['MHSTDAT'][num]mhed[i]=mh['MHENDAT'][num]mhno[i]=mh['MHSEQ'][num]#按检查项名称匹配非计划检查-------------------------------------------------------------------------------------------------------------------------------#AEfor i inrange(len(lb)):if lb['表单名称'][i]=='LB_OTH'and aeterm[i]=='#匹配失败':ratiodic={}for n inrange(1,len(ae)):print(lb['检查日期'][i],ae['AESTDAT'][n],ae['AEENDAT'][n])if(lb['检查日期'][i]-ae['AESTDAT'][n]).days>=0and(lb['检查日期'][i]-ae['AEENDAT'][n]).days <=0and lb['筛选号'][i]==ae['SUBJID'][n]:ratio=fuzz.ratio(lb['检查项目'][i],ae['AETERM'][n])ratiodic[ratio]=niflen(ratiodic)>0andmax(ratiodic.keys())>=50:num=ratiodic[max(ratiodic.keys())]aeterm[i]=ae['AETERM'][num]aest[i]=ae['AESTDAT'][num]aeed[i]=ae['AEENDAT'][num]aeno[i]=ae['AESEQ'][num]#MHfor i inrange(len(lb)):if lb['表单名称'][i]=='LB_OTH'and mhterm[i]=='#匹配失败':ratiodic={}for n inrange(1,len(mh)):if(lb['检查日期'][i]-mh['MHSTDAT'][n]).days>=0and(lb['检查日期'][i]-mh['MHENDAT'][n]).days <=0and lb['筛选号'][i]==mh['SUBJID'][n]:ratio=fuzz.ratio(lb['检查项目'][i],mh['MHTERM'][n])ratiodic[ratio]=niflen(ratiodic)>0andmax(ratiodic.keys())>=50:num=ratiodic[max(ratiodic.keys())]mhterm[i]=mh['MHTERM'][num]mhst[i]=mh['MHSTDAT'][num]mhed[i]=mh['MHENDAT'][num]mhno[i]=mh['MHSEQ'][num]#整合数据------------------------------------------------------------------------------------------------------------------------------------------------
lb.insert(len(lb.columns),'不良事件名称',aeterm)
lb.insert(len(lb.columns),'不良事件序号',aeno)
lb.insert(len(lb.columns),'不良事件开始日期',aest)
lb.insert(len(lb.columns),'不良事件结束日期',aeed)
lb.insert(len(lb.columns),'既往病史名称',mhterm)
lb.insert(len(lb.columns),'既往病史序号',mhno)
lb.insert(len(lb.columns),'既往病史开始日期',mhst)
lb.insert(len(lb.columns),'既往病史结束日期',mhed)#构建匹配结果列for i inrange(len(lb)):if lb['不良事件名称'][i]=='#匹配失败'and lb['既往病史名称'][i]=='#匹配失败':match_result[i]='#匹配失败'if lb['不良事件名称'][i]!='#匹配失败'and lb['既往病史名称'][i]=='#匹配失败':match_result[i]='与AE匹配'if lb['不良事件名称'][i]=='#匹配失败'and lb['既往病史名称'][i]!='#匹配失败':match_result[i]='与MH匹配'if lb['不良事件名称'][i]!='#匹配失败'and lb['既往病史名称'][i]!='#匹配失败':match_result[i]='与AE、MH匹配'
lb.insert(len(lb.columns),'匹配结果',match_result)#调整日期输出格式---------------------------------------------------------------------for i inrange(len(lb)):lb.loc[i,['检查日期']]=datetime.strftime(lb['检查日期'][i],'%Y-%m-%d')if lb['不良事件名称'][i]!='#匹配失败':lb.loc[i,['不良事件开始日期']]=datetime.strftime(lb['不良事件开始日期'][i],'%Y-%m-%d')lb.loc[i,['不良事件结束日期']]=datetime.strftime(lb['不良事件结束日期'][i],'%Y-%m-%d')if lb['既往病史名称'][i]!='#匹配失败':lb.loc[i,['既往病史开始日期']]=datetime.strftime(lb['既往病史开始日期'][i],'%Y-%m-%d')lb.loc[i,['既往病史结束日期']]=datetime.strftime(lb['既往病史结束日期'][i],'%Y-%m-%d')#将整合好的df输出到文件
lb.to_excel(writer,index=False,sheet_name='AEMH匹配到LB')
writer.save()#文件格式调整-----------------------------------------------------------------------------------------------------------
outfile=r'C:\Users\XinXinbuX280\Desktop\阿达木三期p\output\阿达木三期异常有临床意义LB匹配MHAE{}.xlsx'.format(date)
excelformat2.reset_col(outfile)
excelformat2.reset_format(outfile)