Package Gnumed :: Package business :: Module gmXdtObjects
[frames | no frames]

Source Code for Module Gnumed.business.gmXdtObjects

  1  """GNUmed German XDT parsing objects. 
  2   
  3  This encapsulates some of the XDT data into 
  4  objects for easy access. 
  5  """ 
  6  #============================================================== 
  7  # $Source: /cvsroot/gnumed/gnumed/gnumed/client/business/gmXdtObjects.py,v $ 
  8  # $Id: gmXdtObjects.py,v 1.33 2009/05/04 11:39:47 ncq Exp $ 
  9  __version__ = "$Revision: 1.33 $" 
 10  __author__ = "K.Hilbert, S.Hilbert" 
 11  __license__ = "GPL" 
 12   
 13  import os.path, sys, linecache, codecs, re as regex, time, datetime as pyDT, logging 
 14   
 15   
 16  import mx.DateTime as mxDT 
 17   
 18   
 19  if __name__ == '__main__': 
 20          sys.path.insert(0, '../../') 
 21  from Gnumed.pycommon import gmDateTime, gmTools 
 22  from Gnumed.business import gmXdtMappings, gmPerson 
 23   
 24   
 25  _log = logging.getLogger('gm.xdt') 
 26  _log.info(__version__) 
 27   
 28  #============================================================== 
class cDTO_xdt_person(gmPerson.cDTO_person):
    """Person data transfer object for patients read from xDT files.

    Adds nothing to gmPerson.cDTO_person except a store() that does nothing.
    """

    def store(self):
        """Do nothing and return None.

        NOTE(review): presumably this overrides a store() of the base
        class -- confirm against gmPerson.cDTO_person.
        """
        return None
33 #==============================================================
def determine_xdt_encoding(filename=None, default_encoding=None):
    """Return the character encoding declared inside an xDT file.

    The file is scanned line by line for the first field whose ID is
    listed in gmXdtMappings._charset_fields. The single character that
    follows the field ID is then translated into an encoding name via
    gmXdtMappings._map_field2charset.

    filename -- path of the xDT file to inspect
    default_encoding -- returned when the file holds no charset field

    Returns the encoding found in the file, else default_encoding.

    Raises KeyError when a charset field carries a value which is not
    listed in the mapping.
    """
    file_encoding = None

    # The encoding is not known yet, hence decode as utf8 and ignore
    # errors: the file may contain umlauts in whatever encoding it uses.
    # codecs.open() opens the file in binary mode when an encoding is
    # given, so the former 'U' flag (rejected by Python 3.11+) is dropped.
    xdt_file = codecs.open(filename=filename, mode='r', encoding='utf8', errors='ignore')
    try:
        for line in xdt_file:
            # xDT line format: aaabbbbccc... (length, field ID, content)
            field = line[3:7]
            if field not in gmXdtMappings._charset_fields:
                continue
            _log.debug('found charset field [%s] in <%s>', field, filename)
            val = line[7:8]
            file_encoding = gmXdtMappings._map_field2charset[field][val]
            _log.debug('encoding in file is "%s" (%s)', file_encoding, val)
            break
    finally:
        # also release the file handle when the lookup above raises
        xdt_file.close()

    if file_encoding is None:
        _log.debug('no encoding found in <%s>, assuming [%s]', filename, default_encoding)
        return default_encoding

    return file_encoding
54 #==============================================================
def read_person_from_xdt(filename=None, encoding=None, dob_format=None):
    """Read the demographics of the first patient record in an xDT file.

    filename -- path of the xDT file
    encoding -- encoding of the file; when None it is looked up in the
        file itself via determine_xdt_encoding()
    dob_format -- time.strptime() format of the date of birth field
        (3103); '%d%m%Y' is used when None

    Returns a gmPerson.cDTO_person with firstnames, lastnames, dob,
    gender, zip, urb, street and source set. Everything except the names
    is None when the file does not provide it.

    Raises ValueError when the first or the last name is missing, or
    (from time.strptime) when the date of birth does not match the format.
    """
    _map_id2name = {
        '3101': 'lastnames',
        '3102': 'firstnames',
        '3103': 'dob',
        '3110': 'gender',
        '3106': 'zipurb',
        '3107': 'street',
        '3112': 'zip',
        '3113': 'urb',
        '8316': 'source'
    }
    needed_fields = ('lastnames', 'firstnames')

    # try to find encoding if not given
    if encoding is None:
        encoding = determine_xdt_encoding(filename=filename)

    data = {}
    # Line terminators are stripped explicitly below, hence no newline
    # translation ('U' flag, rejected by Python 3.11+) is requested.
    xdt_file = codecs.open(filename=filename, mode='r', encoding=encoding)
    try:
        for line in xdt_file:
            line = line.replace('\015', '').replace('\012', '')
            # xDT line format: aaabbbbcccccccccccCRLF
            # where aaa = length, bbbb = record type, cccc... = content
            field = line[3:7]
            # do we care about this line ?
            if field not in _map_id2name:
                continue
            name = _map_id2name[field]
            if name in data:
                # a field showing up for the second time is taken as the
                # start of a subsequent record, so stop reading
                break
            data[name] = line[7:]
    finally:
        xdt_file.close()

    # found enough data ? (check the required fields themselves rather
    # than merely counting how many fields were found)
    missing = [name for name in needed_fields if name not in data]
    if missing:
        raise ValueError('insufficient patient data in XDT file [%s], found only: %s' % (filename, data))

    dto = gmPerson.cDTO_person()
    dto.firstnames = data['firstnames']
    dto.lastnames = data['lastnames']

    # CAVE: different data orders are possible, so configuration may be needed
    # FIXME: detect xDT version and use default from the standard when dob_format is None
    if 'dob' in data:
        dob = time.strptime(data['dob'], gmTools.coalesce(dob_format, '%d%m%Y'))
        dto.dob = pyDT.datetime(dob.tm_year, dob.tm_mon, dob.tm_mday, tzinfo=gmDateTime.gmCurrentLocalTimezone)
    else:
        dto.dob = None

    # KeyError covers both a missing gender field and an unmapped value
    try:
        dto.gender = gmXdtMappings.map_gender_xdt2gm[data['gender'].lower()]
    except KeyError:
        dto.gender = None

    # zip and urb may arrive combined (3106) and/or separately (3112,
    # 3113); the separate fields take precedence
    zip_code = None
    urb = None
    zipurb = data.get('zipurb')
    if zipurb is not None:
        zip_match = regex.match(r'\d{5}', zipurb)
        # no leading 5-digit ZIP: leave zip unset instead of failing on None
        if zip_match is not None:
            zip_code = zip_match.group()
        urb = regex.sub(r'\d{5} ', '', zipurb)
    dto.zip = data.get('zip', zip_code)
    dto.urb = data.get('urb', urb)

    dto.street = data.get('street')
    dto.source = data.get('source')

    return dto
155 #==============================================================
class cLDTFile(object):
    """Access to the header and the tail of an LDT file.

    The header is everything before the first record 8000 with content
    8202, the tail is everything from the record 8000 with content 8221
    onwards. Both are read lazily and cached as lists of lines, line
    terminators included.
    """

    def __init__(self, filename=None, encoding=None, override_encoding=False):
        """Determine which encoding to read <filename> with.

        filename -- path of the LDT file
        encoding -- encoding to use when the file does not declare one,
            or when override_encoding is True
        override_encoding -- use <encoding> even if the file declares one

        Raises ValueError when no usable encoding can be determined.
        """
        file_encoding = determine_xdt_encoding(filename=filename)
        if file_encoding is None:
            _log.warning('LDT file <%s> does not specify encoding', filename)
            if encoding is None:
                raise ValueError('no encoding specified in file <%s> or method call' % filename)

        if override_encoding:
            if encoding is None:
                raise ValueError('no encoding specified in method call for overriding encoding in file <%s>' % filename)
            self.encoding = encoding
        elif file_encoding is None:
            self.encoding = encoding
        else:
            self.encoding = file_encoding

        self.filename = filename

        self.__header = None
        self.__tail = None
    #----------------------------------------------------------
    def _get_header(self):
        """Return the lines preceding the first LG-Bericht (8000/8202)."""
        if self.__header is not None:
            return self.__header

        header = []
        # codecs.open() reads in binary mode when an encoding is given, so
        # the former 'U' flag (rejected by Python 3.11+) is not passed
        ldt_file = codecs.open(filename=self.filename, mode='r', encoding=self.encoding)
        try:
            for line in ldt_file:
                field = line[3:7]
                content = line[7:].replace('\015', '').replace('\012', '')
                # loop until found first LG-Bericht
                if field == u'8000' and content == u'8202':
                    break
                header.append(line)
        finally:
            ldt_file.close()

        # cache only after a complete read so that a failed read does
        # not leave a truncated header behind
        self.__header = header
        return self.__header

    header = property(_get_header, lambda x:x)
    #----------------------------------------------------------
    def _get_tail(self):
        """Return the lines from the tail record (8000/8221) to the end."""
        if self.__tail is not None:
            return self.__tail

        tail = []
        in_tail = False
        ldt_file = codecs.open(filename=self.filename, mode='r', encoding=self.encoding)
        try:
            for line in ldt_file:
                if not in_tail:
                    field = line[3:7]
                    content = line[7:].replace('\015', '').replace('\012', '')
                    # loop until found tail
                    if field != u'8000' or content != u'8221':
                        continue
                    in_tail = True
                tail.append(line)
        finally:
            ldt_file.close()

        # cache only after a complete read (see _get_header)
        self.__tail = tail
        return self.__tail

    tail = property(_get_tail, lambda x:x)
    #----------------------------------------------------------
    def split_by_patient(self, dir=None, file=None):
        """Split the LDT file into one file per LG-Bericht (unfinished).

        NOTE(review): this method is incomplete. The output file is never
        opened (the call is still a commented-out placeholder), so the
        first 8000/8202 record fails with AttributeError on None. Once
        inside a report, no later record start is looked at anymore.
        <dir> and <file> are not used. The logic is kept as found until
        the intended naming of the output files is known.
        """
        ldt_file = codecs.open(filename=self.filename, mode='r', encoding=self.encoding)
        out_file = None
        in_patient = False
        try:
            for line in ldt_file:

                if in_patient:
                    out_file.write(line)
                    continue

                field = line[3:7]
                content = line[7:].replace('\015', '').replace('\012', '')

                # only the start of a record is of interest here
                if field != u'8000':
                    continue

                if content == u'8202':
                    # start of LG-Bericht
                    in_patient = True
                    if out_file is not None:
                        out_file.write(u''.join(self.tail))
                        out_file.close()
                    #out_file = codecs.open(filename=filename_xxxx, mode=xxxx_'rU', encoding=self.encoding)
                    out_file.write(u''.join(self.header))
                else:
                    in_patient = False
                    if out_file is not None:
                        out_file.write(u''.join(self.tail))
                        out_file.close()

            if out_file is not None and not out_file.closed:
                out_file.write(u''.join(self.tail))
                out_file.close()
        finally:
            ldt_file.close()
#==============================================================
# FIXME: the following *should* get wrapped in class XdtFile ...
#--------------------------------------------------------------
def xdt_get_pats(aFile):
    """Collect the patients listed in an xDT file.

    aFile -- path of the xDT file

    Returns a dict mapping patient ID (field 3000) to patient name
    (field 3101). A name is assigned to the most recently seen ID.

    NOTE(review): as before, a name is only recorded the first time it
    is seen, so a second patient ID carrying an identical name does not
    show up in the result -- confirm this is intended.
    """
    # the module does not import fileinput at the top, hence import here
    import fileinput

    seen_ids = set()
    seen_names = set()
    pats = {}
    pat_id = None
    # xDT line format: aaabbbbcccccccccccCRLF
    # where aaa = length, bbbb = record type, cccc... = content
    try:
        for line in fileinput.input(aFile):
            # remove trailing CR and/or LF
            line = line.replace('\015', '').replace('\012', '')
            field = line[3:7]
            if field == '3000':
                # patient id
                pat_id = line[7:]
                seen_ids.add(pat_id)
            elif field == '3101':
                # patient name
                if pat_id is None:
                    # name without a preceding ID: nothing to file it under
                    continue
                pat_name = line[7:]
                if pat_name not in seen_names:
                    seen_names.add(pat_name)
                    pats[pat_id] = pat_name
    finally:
        # fileinput keeps global state, always release it
        fileinput.close()

    _log.debug("patients found: %s" % len(seen_ids))
    return pats
296 #==============================================================
def get_pat_files(aFile, ID, name, patdir = None, patlst = None):
    """Look up the files registered for one patient.

    The patient list <patlst> is queried for option "files" in group
    "<ID>:<name>". <aFile> is not used.

    Returns a two-element list: [patdir, <value stored under "files">].
    """
    _log.debug("getting files for patient [%s:%s]" % (ID, name))
    group = "%s:%s" % (ID, name)
    pat_files = patlst.get(aGroup = group, anOption = "files")
    _log.debug("%s => %s" % (patdir, pat_files))
    result = [patdir, pat_files]
    return result
302 #==============================================================
def split_xdt_file(aFile, patlst, cfg):
    """Split a multi-patient xDT file into one file per patient record.

    Records start at field 8000. A record is taken to be a patient record
    when its third line holds the patient ID (field 3000) and its fourth
    line the patient name (field 3101). The lines of such a record are
    hashed with MD5. Unless that hash is already registered for the
    patient in <patlst>, the record is written to the directory
    configured as cfg.get("xdt-viewer", "export-dir") and then registered
    in <patlst>.

    aFile -- path of the xDT file
    patlst -- patient list (needs get/set/getGroups/store)
    cfg -- configuration object (needs get(group, option))

    Returns 1.
    """
    # neither module is imported at the top of the file
    import fileinput
    import hashlib

    # xDT line format: aaabbbbcccccccccccCRLF
    # where aaa = length, bbbb = record type, cccc... = content

    # find record starts (records start with 8000)
    record_start_lines = []
    last_lineno = 0
    try:
        for line in fileinput.input(aFile):
            last_lineno = fileinput.filelineno()
            stripped = line.replace('\015', '').replace('\012', '')
            if stripped[3:7] == '8000':
                record_start_lines.append(last_lineno)
    finally:
        fileinput.close()

    # make sure linecache does not serve a stale copy of the file
    linecache.checkcache(aFile)

    # loop over patient records
    for idx, startline in enumerate(record_start_lines):
        # the patient ID is expected two lines below the record start ...
        id_line = linecache.getline(aFile, startline + 2).replace('\015', '').replace('\012', '')
        if id_line[3:7] != '3000':
            continue
        # ... directly followed by the patient name
        name_line = linecache.getline(aFile, startline + 3).replace('\015', '').replace('\012', '')
        if name_line[3:7] != '3101':
            continue
        ID = id_line[7:]
        name = name_line[7:]

        # the record extends up to the next record start, the last one
        # up to the end of the file
        if idx + 1 < len(record_start_lines):
            endline = record_start_lines[idx + 1]
        else:
            endline = last_lineno + 1

        _log.debug("reading from%s" % str(startline) + ' ' + str(endline))
        # start from an empty list for each record so that lines of an
        # already known record do not leak into the next one
        content = []
        for lineno in range(startline, endline):
            content.append(linecache.getline(aFile, lineno))
            _log.debug("reading %s" % lineno)

        hashes = check_for_previous_records(ID, name, patlst)

        # is this new content ?
        data_hash = hashlib.md5()
        for chunk in content:
            # hashlib wants bytes (text lines on Python 3)
            if not isinstance(chunk, bytes):
                chunk = chunk.encode('utf8')
            data_hash.update(chunk)
        digest = data_hash.hexdigest()

        if digest not in hashes:
            pat_dir = cfg.get("xdt-viewer", "export-dir")
            pat_file = write_xdt_pat_data(content, pat_dir)
            add_file_to_patlst(ID, name, patlst, pat_file, digest)

    # cleanup
    patlst.store()
    return 1
362 #==============================================================
def get_rand_fname(aDir):
    """Return a unique file name (without directory) for use in <aDir>.

    The name is obtained from gmTools.get_unique_filename() with an empty
    prefix and a ".YYYYMMDD-HHMMSS" suffix built from the local time.
    Only the last path component of the result is returned.
    """
    stamp = time.strftime(".%Y%m%d-%H%M%S", time.localtime())
    unique_path = gmTools.get_unique_filename(prefix='', suffix = stamp, tmp_dir=aDir)
    return os.path.split(unique_path)[1]
367 #==============================================================
def write_xdt_pat_data(data, aDir):
    """Write the record for this patient to a new file.

    data -- iterable of lines (line terminators included)
    aDir -- directory to create the file in

    Returns the name (without directory) of the file written, as
    generated by get_rand_fname().
    """
    # keep the generated name around: it is the return value
    fname = get_rand_fname(aDir)
    pat_file = open(os.path.join(aDir, fname), "w")
    try:
        # writelines() instead of map(): map() is lazy on Python 3 and
        # would not write anything
        pat_file.writelines(data)
    finally:
        pat_file.close()
    return fname
374 #==============================================================
def check_for_previous_records(ID, name, patlst):
    """Return the hashes of the files already registered for a patient.

    The patient is identified by the group "<ID>:<name>" in <patlst>. If
    that group does not exist yet it is created with an empty "files"
    option. Each entry of "files" must have the form "<file>:<hash>".

    Returns the list of hashes, in the order of the entries.
    """
    identity = "%s:%s" % (ID, name)

    # patient not listed yet
    if identity not in patlst.getGroups():
        _log.debug("identity not yet in list" )
        patlst.set(aGroup = identity, anOption = 'files', aValue = [], aComment = '')

    # file already listed ?
    known_hashes = []
    for entry in patlst.get(aGroup = identity, anOption = "files"):
        fname, file_hash = entry.split(':')
        known_hashes.append(file_hash)
    return known_hashes
389 #==============================================================
def add_file_to_patlst(ID, name, patlst, new_file, ahash):
    """Register a newly written file for a patient in the patient list.

    ID, name -- identify the patient; the group used is "<ID>:<name>"
    patlst -- patient list (needs get/set)
    new_file -- name of the file to register
    ahash -- hash of the content of that file

    The entry "<new_file>:<ahash>" is appended to option "files" of the
    patient's group and the option is stored back into <patlst>.
    """
    anIdentity = "%s:%s" % (ID, name)
    files = patlst.get(aGroup = anIdentity, anOption = "files")
    # register the one file handed in (the former loop iterated over an
    # undefined name)
    files.append("%s:%s" % (new_file, ahash))
    _log.debug("files now there : %s" % files)
    patlst.set(aGroup=anIdentity, anOption="files", aValue = files, aComment="")
#==============================================================
# main
#--------------------------------------------------------------
if __name__ == "__main__":
    # manual test: print header and tail of the LDT file named on the command line
    from Gnumed.pycommon import gmI18N, gmLog2

    root_log = logging.getLogger()
    root_log.setLevel(logging.DEBUG)
    _log = logging.getLogger('gm.xdt')

    #from Gnumed.business import gmPerson
    gmI18N.activate_locale()
    gmI18N.install_domain()
    gmDateTime.init()

    ldt = cLDTFile(filename = sys.argv[1])
    # the attribute is read only after its title has been printed
    for title, attr in (("header:", "header"), ("tail:", "tail")):
        print(title)
        for line in getattr(ldt, attr):
            print(line.encode('utf8', 'replace'))

#    # test framework if run by itself
#    patfile = sys.argv[1]
#    dobformat = sys.argv[2]
#    encoding = sys.argv[3]
#    print "reading patient data from xDT file [%s]" % patfile

#    dto = read_person_from_xdt(patfile, dob_format=dobformat, encoding=encoding)
#    print "DTO:", dto
#    print "dto.dob:", dto.dob, type(dto.dob)
#    print "dto.dob.tz:", dto.dob.tzinfo
#    print "dto.zip: %s dto.urb: %s" % (dto.zip, dto.urb)
#    print "dto.street", dto.street
#    searcher = gmPerson.cPatientSearcher_SQL()
#    ident = searcher.get_identities(dto=dto)[0]
#    print ident
##    print ident.get_medical_age()

#==============================================================
# $Log: gmXdtObjects.py,v $
# Revision 1.33 2009/05/04 11:39:47 ncq
# - md5 is gone
#
# Revision 1.32 2009/02/18 13:43:38 ncq
# - get_unique_filename API change
#
# Revision 1.31 2009/02/05 21:16:59 ncq
# - start supporting importing LDT
#
# Revision 1.30 2008/01/30 13:34:50 ncq
# - switch to std lib logging
#
# Revision 1.29 2007/07/11 21:05:10 ncq
# - use gmTools.get_unique_filename()
#
# Revision 1.28 2007/06/28 12:34:35 ncq
# - handle GDT source field, too
# - safer detection of subsequent records
# - improved date parsing logic
#
# Revision 1.27 2007/05/21 13:04:29 ncq
# - start class cDTO_xdt_person
#
# Revision 1.26 2007/02/22 17:28:45 ncq
# - improve test suite
#
# Revision 1.25 2007/01/21 12:20:45 ncq
# - add determine_xdt_encoding()
#
# Revision 1.24 2007/01/16 17:57:54 ncq
# - improve test suite
#
# Revision 1.23 2007/01/16 13:43:10 ncq
# - use gmDateTime.gmCurrentLocalTimezone for dto.dob
#
# Revision 1.22 2007/01/16 12:13:30 ncq
# - dto.dob now requires datetime.datetime
# - improve test suite
#
# Revision 1.21 2007/01/16 10:26:29 ncq
# - open xdt file in utf8 even for encoding detection since
#   it can still contain umlauts et al
# - fix zipurb vs zip + urb handling
#
# Revision 1.20 2007/01/04 23:09:38 ncq
# - support explicit DOB format in xDT files
#
# Revision 1.19 2006/12/11 18:53:43 ncq
# - make read_person_from_xdt() recognize address data
#
# Revision 1.18 2006/10/30 16:42:27 ncq
# - use more gmXdtMappings
#
# Revision 1.17 2006/10/08 10:48:28 ncq
# - teach xdt reader to derive encoding from gdt 6301 record
#
# Revision 1.16 2006/09/13 07:54:32 ncq
# - clean up imports
# - handle source encoding in read_person_from_xdt()
#
# Revision 1.15 2006/09/12 17:19:53 ncq
# - xDT files have the gender in upper or lower case, so normalize to lower
#
# Revision 1.14 2006/07/22 11:01:00 ncq
# - make gender optional
#
# Revision 1.13 2006/07/19 20:43:59 ncq
# - remove cXDTPatient
#
# Revision 1.12 2006/07/17 18:02:50 ncq
# - cleanup, improve testing
# - add read_person_from_xdt() and use gmPerson.cDTO_person()
#
# Revision 1.11 2006/07/13 21:00:32 ncq
# - cleanup gender mappings
# - streamline cXdtPatient and improve test harness
#
# Revision 1.10 2006/05/12 12:05:04 ncq
# - cleanup
#
# Revision 1.9 2004/03/20 19:45:49 ncq
# - rename gender map
#
# Revision 1.8 2004/03/18 11:05:00 shilbert
# - fixed xDT-parsing in standalone mode
#
# Revision 1.7 2004/02/25 09:46:20 ncq
# - import from pycommon now, not python-common
#
# Revision 1.6 2003/11/17 10:56:35 sjtan
#
# synced and commiting.
#
# Revision 1.1 2003/10/23 06:02:38 sjtan
#
# manual edit areas modelled after r.terry's specs.
#
# Revision 1.5 2003/08/28 18:54:32 shilbert
# - corrected some minor glitches
#
# Revision 1.4 2003/08/27 14:58:58 ncq
# - added helpers written by shilbert for XdtViewer
#
# Revision 1.3 2003/04/19 22:56:03 ncq
# - speed up __load_data(), better encapsulate xdt file maps
#
# Revision 1.2 2003/02/18 02:43:16 ncq
# - rearranged __getitem__ to check self.__data last
#
# Revision 1.1 2003/02/17 23:33:14 ncq
# - first version
#