Package Gnumed :: Package pycommon :: Module gmMatchProvider
[frames] | no frames]

Source Code for Module Gnumed.pycommon.gmMatchProvider

  1  """Base classes for match providers. 
  2   
  3  They are used by business objects to give 
  4  phrasewheels the ability to guess phrases. 
  5   
  6  Copyright (C) GNUMed developers 
  7  license: GPL 
  8  """ 
  9  ############################################################################ 
 10  # $Source: /cvsroot/gnumed/gnumed/gnumed/client/pycommon/gmMatchProvider.py,v $ 
 11  # $Id: gmMatchProvider.py,v 1.34 2009/12/21 15:02:17 ncq Exp $ 
 12  __version__ = "$Revision: 1.34 $" 
 13  __author__  = "K.Hilbert <Karsten.Hilbert@gmx.net>, I.Haywood <ihaywood@gnu.org>, S.J.Tan <sjtan@bigpond.com>" 
 14   
 15  # std lib 
 16  import string, types, time, sys, re as regex, logging 
 17   
 18   
 19  # GNUmed 
 20  from Gnumed.pycommon import gmPG2 
 21   
 22   
# module level logger shared by all match providers in this file;
# the module revision is logged once when the module gets imported
_log = logging.getLogger('gm.ui')
_log.info(__version__)
 25   
 26   
 27  default_ignored_chars = "[?!.'\\(){}\[\]<>~#*$%^_]+" + '"' 
 28  default_word_separators = '[- \t=+&:@]+' 
 29  #============================================================ 
class cMatchProvider(object):
    """Base class for match providing objects.

    Match sources might be:
    - database tables
    - flat files
    - previous input
    - config files
    - in-memory list created on the fly

    Subclasses implement getAllMatches(), getMatchesByPhrase(),
    getMatchesByWord() and getMatchesBySubstr(). getMatches()
    normalizes the fragment and picks one of them depending on
    the length of the normalized fragment.
    """
    # debugging aid: providers supporting it print their queries when True
    print_queries = False

    # Class level defaults: a setThresholds() call which gets rejected
    # before any threshold was stored on the instance can still report
    # (and keep using) sane values instead of raising AttributeError.
    __threshold_phrase = 1
    __threshold_word = 3
    __threshold_substring = 5
    #--------------------------------------------------------
    def __init__(self):
        self.setThresholds()

        # values for context placeholders, see set_context()
        self._context_vals = {}
        self.__ignored_chars = regex.compile(default_ignored_chars)
        self.__word_separators = regex.compile(default_word_separators)
    #--------------------------------------------------------
    # actions
    #--------------------------------------------------------
    def getMatches(self, aFragment = None):
        """Return matches according to aFragment and matching thresholds.

        aFragment -- the text typed so far; u'*' explicitly asks
                     for all matches

        Returns whatever the selected matching method returns; when
        the normalized fragment is shorter than the phrase threshold
        (False, []) is returned.

        Raises ValueError if aFragment is None.

        FIXME: design decision: we dont worry about data source changes
               during the lifetime of a MatchProvider
        FIXME: append _("*get all items*") on truncation
        """
        # sanity check
        if aFragment is None:
            raise ValueError('Cannot find matches without a fragment.')

        # user explicitly wants all matches
        if aFragment == u'*':
            return self.getAllMatches()

        # case insensitivity
        tmpFragment = aFragment.lower()
        # remove ignored chars
        if self.__ignored_chars is not None:
            tmpFragment = self.__ignored_chars.sub('', tmpFragment)
        # normalize word separators: each run becomes one space
        if self.__word_separators is not None:
            tmpFragment = u' '.join(self.__word_separators.split(tmpFragment))
        # length in number of significant characters only
        lngFragment = len(tmpFragment)

        # order is important: test the largest threshold first
        if lngFragment >= self.__threshold_substring:
            return self.getMatchesBySubstr(tmpFragment)
        if lngFragment >= self.__threshold_word:
            return self.getMatchesByWord(tmpFragment)
        if lngFragment >= self.__threshold_phrase:
            return self.getMatchesByPhrase(tmpFragment)
        return (False, [])
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items. Must be implemented by subclasses."""
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Match at the start of phrases. Must be implemented by subclasses."""
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Match at the start of words. Must be implemented by subclasses."""
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Match anywhere inside phrases. Must be implemented by subclasses."""
        raise NotImplementedError
    #--------------------------------------------------------
    # configuration
    #--------------------------------------------------------
    def setThresholds(self, aPhrase = 1, aWord = 3, aSubstring = 5):
        """Set match location thresholds.

        - the fragment passed to getMatches() must contain at least this many
          characters before it triggers a match search at:
          1) phrase_start - start of phrase (first word)
          2) word_start - start of any word within phrase
          3) in_word - _inside_ any word within phrase

        Returns True if the thresholds were stored, False if they were
        rejected (substring < word or word < phrase), in which case the
        previous thresholds stay in effect.
        """
        # sanity checks
        if aSubstring < aWord:
            _log.error(
                'Setting substring threshold (%s) lower than word-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
                aSubstring, aWord, self.__threshold_substring, self.__threshold_word
            )
            return False
        if aWord < aPhrase:
            # report the two values which were actually compared
            _log.error(
                'Setting word-start threshold (%s) lower than phrase-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
                aWord, aPhrase, self.__threshold_word, self.__threshold_phrase
            )
            return False

        # now actually reassign thresholds
        self.__threshold_phrase = aPhrase
        self.__threshold_word = aWord
        self.__threshold_substring = aSubstring

        return True
    #--------------------------------------------------------
    def _set_word_separators(self, word_separators=None):
        # None disables word separator normalization in getMatches()
        if word_separators is None:
            self.__word_separators = None
        else:
            self.__word_separators = regex.compile(word_separators)

    def _get_word_separators(self):
        # hand back the pattern source, not the compiled expression
        if self.__word_separators is None:
            return None
        return self.__word_separators.pattern

    word_separators = property(_get_word_separators, _set_word_separators)
    #--------------------------------------------------------
    def _set_ignored_chars(self, ignored_chars=None):
        # None disables stripping of ignored characters in getMatches()
        if ignored_chars is None:
            self.__ignored_chars = None
        else:
            self.__ignored_chars = regex.compile(ignored_chars)

    def _get_ignored_chars(self):
        # hand back the pattern source, not the compiled expression
        if self.__ignored_chars is None:
            return None
        return self.__ignored_chars.pattern

    ignored_chars = property(_get_ignored_chars, _set_ignored_chars)
    #--------------------------------------------------------
    def set_context (self, context=None, val=None):
        """Set value to provide context information for matches.

        The matching code may ignore it depending on its exact
        implementation. Names and values of the context depend
        on what is being matched.

        <context> -- the *placeholder* key *inside* the context
                     definition, not the context *definition* key

        Returns False (storing nothing) if context is None, else True.
        """
        if context is None:
            return False
        self._context_vals[context] = val
        return True
    #--------------------------------------------------------
    def unset_context(self, context=None):
        """Forget the value stored for <context>; unknown keys are ignored."""
        self._context_vals.pop(context, None)
171 #------------------------------------------------------------ 172 # usable instances 173 #------------------------------------------------------------
class cMatchProvider_FixedList(cMatchProvider):
    """Match provider where all possible options can be held
    in a reasonably sized, pre-allocated list.
    """
    def __init__(self, aSeq = None):
        """aSeq must be a list or tuple of dicts. Each dict must have the keys (data, label, weight)

        Raises TypeError if aSeq is neither a list nor a tuple.
        """
        if not isinstance(aSeq, (list, tuple)):
            msg = 'fixed list match provider argument must be a list or tuple of dicts'
            _log.error(msg)
            raise TypeError(msg)

        self.__items = aSeq
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        return self.__deliver([
            item for item in self.__items
            if item['label'].lower().startswith(aFragment)
        ])
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""
        # Looking for " <fragment>" finds a word start anywhere in the
        # label. Checking only the first occurrence of the fragment
        # would miss "bar" in "foobar bar".
        word_start = ' ' + aFragment
        matches = []
        for item in self.__items:
            label = item['label'].lower()
            if label.startswith(aFragment) or (word_start in label):
                matches.append(item)
        return self.__deliver(matches)
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        return self.__deliver([
            item for item in self.__items
            if aFragment in item['label'].lower()
        ])
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items."""
        return self.__deliver(self.__items)
    #--------------------------------------------------------
    def set_items(self, items):
        """items must be a list of dicts. Each dict must have the keys (data, label, weight)"""
        self.__items = items
    #--------------------------------------------------------
    def __deliver(self, items):
        """Turn <items> into the (found, matches) result tuple.

        Matches are ordered by descending weight, items of equal
        weight keep their relative order. sorted() returns a new
        list so the stored sequence is neither reordered nor
        required to be a list (tuples are accepted by __init__).
        """
        if not items:
            return (False, [])
        ordered = sorted(items, key = lambda item: item['weight'], reverse = True)
        return (True, ordered)
269 # ===========================================================
class cMatchProvider_Func(cMatchProvider):
    """Match provider which searches matches
    in the results of a function call.
    """
    def __init__(self, get_candidates = None):
        """get_candidates() must return a list of dicts, the
        matching code reads the key 'label' of each dict.

        Raises ValueError if no function is given.
        """
        if get_candidates is None:
            msg = 'must define function to retrieve match candidates list'
            _log.error(msg)
            raise ValueError(msg)

        self._get_candidates = get_candidates
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        _log.debug('getting phrase matches')
        # the label must start with the fragment, not the other way round
        return self.__deliver([
            candidate for candidate in self._get_candidates()
            if candidate['label'].lower().startswith(aFragment)
        ])
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""
        _log.debug('getting word matches')
        # FIXME: use word seps
        word_start = ' ' + aFragment
        matches = []
        for candidate in self._get_candidates():
            label = candidate['label'].lower()
            # labels not containing the fragment at all never match
            if label.startswith(aFragment) or (word_start in label):
                matches.append(candidate)
        return self.__deliver(matches)
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        return self.__deliver([
            candidate for candidate in self._get_candidates()
            if aFragment in candidate['label'].lower()
        ])
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all candidates."""
        # NOTE(review): this hands back the bare result of
        # get_candidates() while every other matching method returns
        # a (found, matches) tuple - confirm what callers expect
        # before aligning it
        return self._get_candidates()
    #--------------------------------------------------------
    def __deliver(self, matches):
        """Turn <matches> into the (found, matches) result tuple.

        Candidates are delivered in the order get_candidates()
        supplied them.
        FIXME: do ordering
        """
        if not matches:
            return (False, [])
        return (True, matches)

# ===========================================================
class cMatchProvider_SQL2(cMatchProvider):
    """Match provider which searches matches
    in possibly several database tables.

    queries:
        - a list of unicode strings
        - each string is a query
        - each string must contain: "... where <column> %(fragment_condition)s ..."
        - each string can contain in the where clause: "... %(<context_key>)s ..."

    context definitions to be used in the queries
    example: {'ctxt_country': {'where_part': 'and country = %(country)s', 'placeholder': 'country'}}
    """
    def __init__(self, queries = None, context = None):
        # a single query is accepted, too (it is not str()ed so as
        # not to lose unicodity)
        if not isinstance(queries, list):
            queries = [queries]

        self._queries = queries

        if context is None:
            self._context = {}
        else:
            self._context = context

        # arguments handed to the database along with each query
        self._args = {}
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        fragment_condition = u"ilike %(fragment)s"
        self._args['fragment'] = u"%s%%" % aFragment
        return self.__find_matches(fragment_condition)
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""
        fragment_condition = u"~* %(fragment)s"
        aFragment = gmPG2.sanitize_pg_regex(expression = aFragment, escape_all = False)
        # either preceded by a space or at the very start of the value
        self._args['fragment'] = u"( %s)|(^%s)" % (aFragment, aFragment)
        return self.__find_matches(fragment_condition)
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        fragment_condition = u"ilike %(fragment)s"
        self._args['fragment'] = u"%%%s%%" % aFragment
        return self.__find_matches(fragment_condition)
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items."""
        return self.getMatchesBySubstr(u'')
    #--------------------------------------------------------
    def __find_matches(self, fragment_condition):
        """Run the queries in order until one of them returns rows.

        Returns (True, matches) for the first query yielding rows,
        each match being {'data': <column 0>, 'label': <column 1>,
        'weight': 0}. Matches are not sorted here, their order is
        whatever the query delivered.

        Returns (False, []) if no query returned rows. A query which
        fails is logged and removed from the query list, and
        (False, []) is returned without trying further queries.
        """
        for query in self._queries:
            where_fragments = {'fragment_condition': fragment_condition}

            for context_key, context_def in self._context.items():
                try:
                    placeholder = context_def['placeholder']
                    where_part = context_def['where_part']
                    self._args[placeholder] = self._context_vals[placeholder]
                except KeyError:
                    # we don't have a context value for this key, so skip the where condition
                    where_fragments[context_key] = u''
                else:
                    # we do have a context value for this key, so add the where condition
                    where_fragments[context_key] = where_part

            cmd = query % where_fragments

            if self.print_queries:
                print(self.__class__.__name__)
                print(self._context_vals)
                print(self._args)
                print(cmd)

            try:
                rows, idx = gmPG2.run_ro_queries(queries = [{'cmd': cmd, 'args': self._args}])
            except Exception:
                # do not trap KeyboardInterrupt/SystemExit along the way
                _log.exception('Error running match provider SQL, dropping query.')
                self._queries.remove(query)
                # the list just changed under the loop: stop iterating
                return (False, [])

            # no matches found: try next query
            if len(rows) == 0:
                continue

            return (True, [
                {'data': row[0], 'label': row[1], 'weight': 0}
                for row in rows
            ])

        # none found whatsoever
        return (False, [])
#================================================================
# no self-test is implemented: running the module directly does nothing
if __name__ == '__main__':
    pass

#================================================================
# $Log: gmMatchProvider.py,v $
# Revision 1.34  2009/12/21 15:02:17  ncq
# - fix typo
#
# Revision 1.33  2009/04/05 17:58:27  ncq
# - improved docs
#
# Revision 1.32  2009/04/03 09:34:06  ncq
# - some exception cleanup
#
# Revision 1.31  2009/03/01 18:07:14  ncq
# - factor out default ignored chars/word separators onto module level
#
# Revision 1.30  2009/01/21 22:34:09  ncq
# - make FixedList match provider work nicely again
#
# Revision 1.29  2008/06/16 15:02:35  ncq
# - cleanup
# - remove unneeded methods
#
# Revision 1.28  2008/06/15 20:31:10  ncq
# - make match provider derive from object
# - turn ignored chars and word separators into properties
# - raise NotImplementedError in base match provider
# - remove dis/enableMatching
#
# Revision 1.27  2008/06/09 15:28:21  ncq
# - .print_queries and support it in sql provider
#
# Revision 1.26  2008/04/29 18:29:29  ncq
# - remove increaseScore
#
# Revision 1.25  2007/12/12 16:17:15  ncq
# - better logger names
#
# Revision 1.24  2007/12/11 14:31:11  ncq
# - use std logging
#
# Revision 1.23  2007/12/02 20:59:13  ncq
# - drop failing queries
#
# Revision 1.22  2007/07/03 15:57:24  ncq
# - use gmPG2.sanitize_pg_regex()
# - ignore failing match retrieval queries such
#   that we don't freak out in the phrasewheel
#
# Revision 1.21  2007/01/07 23:02:11  ncq
# - more documentation on context
#
# Revision 1.20  2006/11/06 09:59:42  ncq
# - when allowing non-list strings to turn into query list do not
#   str() them or else we may lose unicodity
# - more u''ing
#
# Revision 1.19  2006/11/05 16:07:31  ncq
# - *_SQL2 now really handles context values, tested, too
# - some u''-ification
# - don't sort items in *_SQL2, rely on in-query ORDER BY instead
#
# Revision 1.18  2006/10/24 13:18:29  ncq
# - switch to gmPG2
# - remove cMatchProvider_SQL()
#
# Revision 1.17  2006/05/25 22:13:30  ncq
# - robustify set_context()
#
# Revision 1.16  2006/05/01 18:46:05  ncq
# - cleanup
#
# Revision 1.15  2005/06/14 18:54:40  ncq
# - don't sort in SQL2 matcher - queries should ORDER BY
#
# Revision 1.14  2005/06/12 21:20:55  ncq
# - make SQL2 match provider more robust regarding query list
#
# Revision 1.13  2005/06/12 21:16:55  ncq
# - make SQL2 match provider accept a query list
#
# Revision 1.12  2005/06/10 17:07:34  cfmoro
# Fixed set_context in SQL2
#
# Revision 1.11  2005/06/08 01:27:12  cfmoro
# Renamed function to make parent set_context work
#
# Revision 1.10  2005/06/07 10:16:37  ncq
# - setContext -> set_context
#
# Revision 1.9  2005/05/08 21:40:57  ncq
# - cleanup
#
# Revision 1.8  2005/04/14 18:24:57  ncq
# - some cleanup of funky magic so we are faster
#
# Revision 1.7  2005/04/11 18:00:54  ncq
# - cleanup
#
# Revision 1.6  2005/03/14 14:35:27  ncq
# - add match provider class cMatchProvider_Func which pulls
#   match candidates through a function
#
# Revision 1.5  2004/07/17 21:08:51  ncq
# - gmPG.run_query() now has a verbosity parameter, so use it
#
# Revision 1.4  2004/05/02 22:54:43  ncq
# - cleanup
#
# Revision 1.3  2004/04/30 09:10:57  ncq
# - label needs to be str()ed in list.append()
#
# Revision 1.2  2004/03/10 12:56:01  ihaywood
# fixed sudden loss of main.shadow
# more work on referrals,
#
# Revision 1.1  2004/02/25 09:30:13  ncq
# - moved here from python-common
#
# Revision 1.13  2004/01/12 13:10:27  ncq
# - remove debugging code
#
# Revision 1.12  2004/01/06 10:02:47  ncq
# - add _SQL2 match provider that operates on queries rather than tables/columns
#
# Revision 1.11  2003/12/29 16:28:04  uid66147
# - I think we got the indentation level wrong when
#   applying the extra condition default context
#
# Revision 1.10  2003/11/20 08:55:05  ncq
# - some internal cleanup/renaming
#
# Revision 1.9  2003/11/20 02:16:03  sjtan
#
# make __context_val in base class gmMatchProvider protected instead of class private, so subclasses can
# access it.
#
# Revision 1.8  2003/11/20 01:37:05  sjtan
#
# syntax correction.
#
# Revision 1.7  2003/11/20 00:33:12  ncq
# - improve comments on extra conditions in __find_matches()
#
# Revision 1.6  2003/11/19 23:18:37  ncq
# - some cleanup
#