Package Gnumed :: Package pycommon :: Module gmMatchProvider
[frames] | no frames]

Source Code for Module Gnumed.pycommon.gmMatchProvider

  1  """Base classes for match providers. 
  2   
  3  They are used by business objects to give 
  4  phrasewheels the ability to guess phrases. 
  5   
  6  Copyright (C) GNUMed developers 
  7  license: GPL v2 or later 
  8  """ 
  9  __version__ = "$Revision: 1.34 $" 
 10  __author__  = "K.Hilbert <Karsten.Hilbert@gmx.net>, I.Haywood <ihaywood@gnu.org>, S.J.Tan <sjtan@bigpond.com>" 
 11   
 12  # std lib 
 13  import re as regex, logging 
 14   
 15   
 16  # GNUmed 
 17  from Gnumed.pycommon import gmPG2 
 18   
 19   
 20  _log = logging.getLogger('gm.ui') 
 21  _log.info(__version__) 
 22   
 23   
 24  # these are stripped from the fragment passed to the 
 25  # match provider before looking for matches: 
 26  default_ignored_chars = "[?!.'\\(){}\[\]<>~#*$%^_]+" + '"' 
 27   
 28  # these are used to detect word boundaries which is, 
 29  # in turn, used to normalize word boundaries in the 
 30  # input fragment 
 31  default_word_separators = '[- \t=+&:@]+' 
 32  #============================================================ 
class cMatchProvider(object):
    """Base class for match providing objects.

    Match sources might be:
        - database tables
        - flat files
        - previous input
        - config files
        - in-memory list created on the fly

    getMatches() normalizes the input fragment and, depending on
    its length, dispatches to getMatchesByPhrase(), getMatchesByWord()
    or getMatchesBySubstr() (or to getAllMatches() for u'*'). Those
    four raise NotImplementedError here and must be implemented by
    subclasses.
    """
    # debugging switch, not evaluated by this base class
    print_queries = False

    #--------------------------------------------------------
    def __init__(self):
        self.setThresholds()

        # values set via set_context(), keyed by placeholder name
        self._context_vals = {}
        self.__ignored_chars = regex.compile(default_ignored_chars)
        # used to normalize word boundaries:
        self.__word_separators = regex.compile(default_word_separators)

    #--------------------------------------------------------
    # actions
    #--------------------------------------------------------
    def getMatches(self, aFragment=None):
        """Return matches according to aFragment and matching thresholds.

        The fragment is lower-cased, ignored characters are removed
        and runs of word separators are replaced by a single space
        before its length is compared against the thresholds.

        Returns whatever the selected getMatches*() method returns,
        or (False, []) if the normalized fragment is shorter than
        the phrase threshold.

        Raises ValueError if aFragment is None.

        FIXME: design decision: we don't worry about data source changes
               during the lifetime of a MatchProvider
        FIXME: append _("*get all items*") on truncation
        """
        # sanity check
        if aFragment is None:
            raise ValueError('Cannot find matches without a fragment.')

        # user explicitly wants all matches
        if aFragment == u'*':
            return self.getAllMatches()

        # case insensitivity
        tmpFragment = aFragment.lower()
        # remove ignored chars
        if self.__ignored_chars is not None:
            tmpFragment = self.__ignored_chars.sub('', tmpFragment)
        # normalize word separators
        if self.__word_separators is not None:
            tmpFragment = u' '.join(self.__word_separators.split(tmpFragment))
        # length in number of significant characters only
        lngFragment = len(tmpFragment)

        # order is important: test the largest threshold first
        if lngFragment >= self.__threshold_substring:
            return self.getMatchesBySubstr(tmpFragment)
        if lngFragment >= self.__threshold_word:
            return self.getMatchesByWord(tmpFragment)
        if lngFragment >= self.__threshold_phrase:
            return self.getMatchesByPhrase(tmpFragment)
        return (False, [])

    #--------------------------------------------------------
    def getAllMatches(self):
        raise NotImplementedError

    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        raise NotImplementedError

    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        raise NotImplementedError

    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        raise NotImplementedError

    #--------------------------------------------------------
    # configuration
    #--------------------------------------------------------
    def setThresholds(self, aPhrase=1, aWord=3, aSubstring=5):
        """Set match location thresholds.

        - the fragment passed to getMatches() must contain at least this many
          characters before it triggers a match search at:
            1) phrase_start - start of phrase (first word)
            2) word_start - start of any word within phrase
            3) in_word - _inside_ any word within phrase

        Requires aPhrase <= aWord <= aSubstring. Returns True if the
        thresholds were set, False (after logging an error, leaving
        the previous thresholds in place) otherwise.
        """
        # sanity checks
        if aSubstring < aWord:
            _log.error(
                'Setting substring threshold (%s) lower than word-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
                aSubstring,
                aWord,
                self.__threshold_substring,
                self.__threshold_word
            )
            return False
        if aWord < aPhrase:
            # report the values actually being compared (word vs phrase)
            _log.error(
                'Setting word-start threshold (%s) lower than phrase-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
                aWord,
                aPhrase,
                self.__threshold_word,
                self.__threshold_phrase
            )
            return False

        # now actually reassign thresholds
        self.__threshold_phrase = aPhrase
        self.__threshold_word = aWord
        self.__threshold_substring = aSubstring

        return True

    #--------------------------------------------------------
    def _set_word_separators(self, word_separators=None):
        if word_separators is None:
            self.__word_separators = None
        else:
            self.__word_separators = regex.compile(word_separators)

    def _get_word_separators(self):
        if self.__word_separators is None:
            return None
        return self.__word_separators.pattern

    # regex pattern (string) of word separators, None disables normalization
    word_separators = property(_get_word_separators, _set_word_separators)

    #--------------------------------------------------------
    def _set_ignored_chars(self, ignored_chars=None):
        if ignored_chars is None:
            self.__ignored_chars = None
        else:
            self.__ignored_chars = regex.compile(ignored_chars)

    def _get_ignored_chars(self):
        if self.__ignored_chars is None:
            return None
        return self.__ignored_chars.pattern

    # regex pattern (string) of ignored characters, None disables stripping
    ignored_chars = property(_get_ignored_chars, _set_ignored_chars)

    #--------------------------------------------------------
    def set_context(self, context=None, val=None):
        """Set value to provide context information for matches.

        The matching code may ignore it depending on its exact
        implementation. Names and values of the context depend
        on what is being matched.

        <context> -- the *placeholder* key *inside* the context
                     definition, not the context *definition* key

        Returns False if <context> is None, True otherwise.
        """
        if context is None:
            return False
        self._context_vals[context] = val
        return True

    #--------------------------------------------------------
    def unset_context(self, context=None):
        """Forget the value stored for <context>, if there is any."""
        self._context_vals.pop(context, None)
175 #------------------------------------------------------------ 176 # usable instances 177 #------------------------------------------------------------
class cMatchProvider_FixedList(cMatchProvider):
    """Match provider where all possible options can be held
    in a reasonably sized, pre-allocated list.

    Matching is done case-insensitively against each item's
    "list_label". Results are ordered by descending "weight",
    items of equal weight keep their relative order.
    """
    def __init__(self, aSeq=None):
        """<aSeq> must be None or a list/tuple of dicts.

        Each dict must have the keys (data, list_label, weight).
        None is treated like an empty list.

        Raises TypeError if <aSeq> is of any other type.
        """
        if not isinstance(aSeq, (type(None), list, tuple)):
            _log.error('fixed list match provider argument must be a list/tuple of dicts/None')
            raise TypeError('fixed list match provider argument must be a list/tuple of dicts/None')

        self.__items = aSeq
        cMatchProvider.__init__(self)

    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        fragment = aFragment.lower()
        return self.__ranked([
            item for item in self.__candidates()
            if item['list_label'].lower().startswith(fragment)
        ])

    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""
        fragment = aFragment.lower()
        # a word start is either the start of the label or any
        # position preceded by a space - searching for the
        # space-prefixed fragment considers *all* occurrences,
        # not just the first one found
        at_word_start = ' ' + fragment
        matches = []
        for item in self.__candidates():
            label = item['list_label'].lower()
            if label.startswith(fragment) or (at_word_start in label):
                matches.append(item)
        return self.__ranked(matches)

    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        fragment = aFragment.lower()
        return self.__ranked([
            item for item in self.__candidates()
            if fragment in item['list_label'].lower()
        ])

    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items."""
        # hand out a copy so that sorting does not touch the stored sequence
        return self.__ranked(list(self.__candidates()))

    #--------------------------------------------------------
    def set_items(self, items):
        """items must be a list of dicts. Each dict must have the keys (data, list_label, weight)"""
        self.__items = items

    #--------------------------------------------------------
    def __candidates(self):
        """Return the stored items, an empty tuple if none were set."""
        if self.__items is None:
            return ()
        return self.__items

    #--------------------------------------------------------
    def __ranked(self, matches):
        """Turn the list <matches> into a (found, matches) result.

        Sorts by descending weight. The sort is stable, so items
        of equal weight stay in their original order.
        """
        if not matches:
            return (False, [])
        matches.sort(key=lambda item: item['weight'], reverse=True)
        return (True, matches)
274 # ===========================================================
class cMatchProvider_Func(cMatchProvider):
    """Match provider which searches matches
    in the results of a function call.

    The function is called anew for each search. Matches are
    returned in the order in which the function delivered them.

    FIXME: do ordering
    """
    def __init__(self, get_candidates=None):
        """<get_candidates>() must return a list of dicts each of
        which has (at least) the key "list_label".

        Raises ValueError if <get_candidates> is None.
        """
        if get_candidates is None:
            _log.error('must define function to retrieve match candidates list')
            raise ValueError('must define function to retrieve match candidates list')

        self._get_candidates = get_candidates
        cMatchProvider.__init__(self)

    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        # the *label* must start with the fragment, not the other way round
        return self.__as_result([
            candidate for candidate in self._get_candidates()
            if candidate['list_label'].lower().startswith(aFragment)
        ])

    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""
        # FIXME: use word seps
        at_word_start = ' ' + aFragment
        matches = []
        for candidate in self._get_candidates():
            label = candidate['list_label'].lower()
            # either at the very start of the label or
            # anywhere directly following a space
            if label.startswith(aFragment) or (at_word_start in label):
                matches.append(candidate)
        return self.__as_result(matches)

    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        return self.__as_result([
            candidate for candidate in self._get_candidates()
            if aFragment in candidate['list_label'].lower()
        ])

    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all candidates.

        Like the other getMatches*() methods this returns a
        (found, matches) tuple because getMatches() passes the
        result on unchanged when asked for u'*'.
        """
        return self.__as_result(list(self._get_candidates()))

    #--------------------------------------------------------
    def __as_result(self, matches):
        """Turn the list <matches> into a (found, matches) result."""
        if not matches:
            return (False, [])
        return (True, matches)

# ===========================================================
class cMatchProvider_SQL2(cMatchProvider):
    """Match provider which searches matches
    in possibly several database tables.

    queries:
        - a list of unicode strings
        - each string is a query
        - each string must contain: "... where <column> %(fragment_condition)s ..."
        - each string can contain in the where clause: "... %(<context_key>)s ..."
        - each query must return (data, list_label), it may
          return field_label as a third column

    context definitions to be used in the queries
    example: {'ctxt_country': {'where_part': 'and country = %(country)s', 'placeholder': 'country'}}
    """
    def __init__(self, queries=None, context=None):
        """<queries>: a query string or a list thereof,
        <context>: dict of context definitions or None.
        """
        if not isinstance(queries, list):
            queries = [queries]

        self._queries = queries

        if context is None:
            self._context = {}
        else:
            self._context = context

        # query arguments, reused (and updated) across searches
        self._args = {}
        cMatchProvider.__init__(self)

    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""

        fragment_condition = u"ILIKE %(fragment)s"
        self._args['fragment'] = u"%s%%" % aFragment

        return self._find_matches(fragment_condition)

    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""

        fragment_condition = u"~* %(fragment)s"
        aFragment = gmPG2.sanitize_pg_regex(expression = aFragment, escape_all = False)
        self._args['fragment'] = u"( %s)|(^%s)" % (aFragment, aFragment)

        return self._find_matches(fragment_condition)

    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""

        fragment_condition = u"ILIKE %(fragment)s"
        self._args['fragment'] = u"%%%s%%" % aFragment

        return self._find_matches(fragment_condition)

    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items."""
        return self.getMatchesBySubstr(u'')

    #--------------------------------------------------------
    def _find_matches(self, fragment_condition):
        """Run the queries in order, return the rows of the first one yielding any.

        <fragment_condition> replaces %(fragment_condition)s in each
        query. For each context definition the "where_part" is put
        into the query only if a value was set for its placeholder,
        otherwise an empty string is used.

        A query which fails to run is logged, removed from the list
        of queries, and the search is given up.

        Returns (True, [match, ...]) or (False, []), each match being
        a dict with the keys weight, data, list_label, field_label.
        """
        if self.print_queries:
            print("----------------------")

        for query in self._queries:
            where_fragments = {'fragment_condition': fragment_condition}

            for context_key, context_def in self._context.items():
                try:
                    placeholder = context_def['placeholder']
                    where_part = context_def['where_part']
                    self._args[placeholder] = self._context_vals[placeholder]
                except KeyError:
                    # we don't have a context value for this key, so skip the where condition
                    where_fragments[context_key] = u''
                    continue
                # we do have a context value for this key, so add the where condition
                where_fragments[context_key] = where_part
                if self.print_queries:
                    print("ctxt ph: %s" % (placeholder,))
                    print("ctxt where: %s" % (where_part,))
                    print("ctxt val: %s" % (self._context_vals[placeholder],))

            cmd = query % where_fragments

            if self.print_queries:
                print("class: %s" % (self.__class__.__name__,))
                print("ctxt: %s" % (self._context_vals,))
                print("args: %s" % (self._args,))
                print("query: %s" % (cmd,))

            try:
                rows, idx = gmPG2.run_ro_queries(queries = [{'cmd': cmd, 'args': self._args}])
            except Exception:
                # not a bare "except:" - do not swallow KeyboardInterrupt/SystemExit
                _log.exception('[%s]: error running match provider SQL, dropping query', self.__class__.__name__)
                # safe despite iterating over the list: we leave the loop right away
                self._queries.remove(query)
                break

            # no matches found: try next query
            if len(rows) == 0:
                continue

            return (True, [self.__row2match(row) for row in rows])

        # none found whatsoever
        return (False, [])

    #--------------------------------------------------------
    def __row2match(self, row):
        """Convert a result row into a match dict.

        Columns are looked up by name first, falling back
        to the position within the row.
        """
        match = {'weight': 0}

        try:
            match['data'] = row['data']
        except KeyError:
            match['data'] = row[0]

        try:
            match['list_label'] = row['list_label']
        except KeyError:
            match['list_label'] = row[1]

        # explicit "field_label" in result ?
        try:
            match['field_label'] = row['field_label']
        except KeyError:
            # no, but does row[2] exist ?
            try:
                match['field_label'] = row[2]
            except IndexError:
                # no: reuse "list_label"
                match['field_label'] = match['list_label']

        return match
504 #================================================================ 505 if __name__ == '__main__': 506 pass 507 508 #================================================================ 509