Package Gnumed :: Package pycommon :: Module gmMatchProvider
[frames | no frames]

Source Code for Module Gnumed.pycommon.gmMatchProvider

  1  """Base classes for match providers. 
  2   
  3  They are used by business objects to give 
  4  phrasewheels the ability to guess phrases. 
  5   
  6  Copyright (C) GNUMed developers 
  7  license: GPL 
  8  """ 
  9  __version__ = "$Revision: 1.34 $" 
 10  __author__  = "K.Hilbert <Karsten.Hilbert@gmx.net>, I.Haywood <ihaywood@gnu.org>, S.J.Tan <sjtan@bigpond.com>" 
 11   
 12  # std lib 
 13  import string, types, time, sys, re as regex, logging 
 14   
 15   
 16  # GNUmed 
 17  from Gnumed.pycommon import gmPG2 
 18   
 19   
# module-level logger shared by all match providers in this file;
# the module revision string is written to the log once, at import time
_log = logging.getLogger('gm.ui')
_log.info(__version__)
 22   
 23   
 24  default_ignored_chars = "[?!.'\\(){}\[\]<>~#*$%^_]+" + '"' 
 25  default_word_separators = '[- \t=+&:@]+' 
 26  #============================================================ 
class cMatchProvider(object):
    """Base class for match providing objects.

    Match sources might be:
    - database tables
    - flat files
    - previous input
    - config files
    - in-memory list created on the fly

    Subclasses implement getAllMatches(), getMatchesByPhrase(),
    getMatchesByWord() and getMatchesBySubstr(). getMatches()
    normalizes the user input and dispatches to one of them
    depending on the configured thresholds.
    """
    print_queries = False

    # Class-level fallbacks mirroring the setThresholds() defaults. They let
    # setThresholds() report the thresholds being retained even when the
    # very first call on an instance is rejected.
    __threshold_phrase = 1
    __threshold_word = 3
    __threshold_substring = 5
    #--------------------------------------------------------
    def __init__(self):
        self.setThresholds()

        # context placeholder -> value, filled by set_context()
        self._context_vals = {}
        self.__ignored_chars = regex.compile(default_ignored_chars)
        self.__word_separators = regex.compile(default_word_separators)
    #--------------------------------------------------------
    # actions
    #--------------------------------------------------------
    def getMatches(self, aFragment = None):
        """Return matches according to aFragment and matching thresholds.

        The fragment is lower-cased, stripped of ignored characters and
        has its word separators collapsed into single spaces. The length
        of the result selects the matching strategy.

        Returns whatever the selected getMatchesBy*() / getAllMatches()
        method returns, or (False, []) if the normalized fragment is
        shorter than the phrase threshold.

        Raises ValueError if aFragment is None.

        FIXME: design decision: we dont worry about data source changes
               during the lifetime of a MatchProvider
        FIXME: append _("*get all items*") on truncation
        """
        # sanity check
        if aFragment is None:
            raise ValueError('Cannot find matches without a fragment.')

        # user explicitly wants all matches
        if aFragment == u'*':
            return self.getAllMatches()

        # case insensitivity
        tmpFragment = aFragment.lower()
        # remove ignored chars
        if self.__ignored_chars is not None:
            tmpFragment = self.__ignored_chars.sub('', tmpFragment)
        # normalize word separators
        if self.__word_separators is not None:
            tmpFragment = u' '.join(self.__word_separators.split(tmpFragment))
        # length in number of significant characters only
        lngFragment = len(tmpFragment)

        # order is important: test the largest threshold first so that
        # the most permissive strategy allowed for this length wins
        if lngFragment >= self.__threshold_substring:
            return self.getMatchesBySubstr(tmpFragment)
        if lngFragment >= self.__threshold_word:
            return self.getMatchesByWord(tmpFragment)
        if lngFragment >= self.__threshold_phrase:
            return self.getMatchesByPhrase(tmpFragment)
        return (False, [])
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all available items; to be implemented by subclasses."""
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Match aFragment at the start of phrases; to be implemented by subclasses."""
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Match aFragment at the start of words; to be implemented by subclasses."""
        raise NotImplementedError
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Match aFragment anywhere inside phrases; to be implemented by subclasses."""
        raise NotImplementedError
    #--------------------------------------------------------
    # configuration
    #--------------------------------------------------------
    def setThresholds(self, aPhrase = 1, aWord = 3, aSubstring = 5):
        """Set match location thresholds.

        - the fragment passed to getMatches() must contain at least this many
          characters before it triggers a match search at:
          1) phrase_start - start of phrase (first word)
          2) word_start - start of any word within phrase
          3) in_word - _inside_ any word within phrase

        Returns True if the thresholds were applied. Returns False, after
        logging an error and leaving the thresholds untouched, unless
        aPhrase <= aWord <= aSubstring.
        """
        # sanity checks
        if aSubstring < aWord:
            _log.error(
                'Setting substring threshold (%s) lower than word-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
                aSubstring, aWord, self.__threshold_substring, self.__threshold_word
            )
            return False
        if aWord < aPhrase:
            _log.error(
                'Setting word-start threshold (%s) lower than phrase-start threshold (%s) does not make sense. Retaining original thresholds (%s:%s, respectively).',
                aWord, aPhrase, self.__threshold_word, self.__threshold_phrase
            )
            return False

        # now actually reassign thresholds
        self.__threshold_phrase = aPhrase
        self.__threshold_word = aWord
        self.__threshold_substring = aSubstring

        return True
    #--------------------------------------------------------
    def _set_word_separators(self, word_separators=None):
        # None disables word separator normalization in getMatches()
        if word_separators is None:
            self.__word_separators = None
        else:
            self.__word_separators = regex.compile(word_separators)

    def _get_word_separators(self):
        if self.__word_separators is None:
            return None
        return self.__word_separators.pattern

    # regular expression source (or None) for word separators
    word_separators = property(_get_word_separators, _set_word_separators)
    #--------------------------------------------------------
    def _set_ignored_chars(self, ignored_chars=None):
        # None disables removal of ignored characters in getMatches()
        if ignored_chars is None:
            self.__ignored_chars = None
        else:
            self.__ignored_chars = regex.compile(ignored_chars)

    def _get_ignored_chars(self):
        if self.__ignored_chars is None:
            return None
        return self.__ignored_chars.pattern

    # regular expression source (or None) for ignored characters
    ignored_chars = property(_get_ignored_chars, _set_ignored_chars)
    #--------------------------------------------------------
    def set_context (self, context=None, val=None):
        """Set value to provide context information for matches.

        The matching code may ignore it depending on its exact
        implementation. Names and values of the context depend
        on what is being matched.

        <context> -- the *placeholder* key *inside* the context
                     definition, not the context *definition* key

        Returns False (storing nothing) if context is None, else True.
        """
        if context is None:
            return False
        self._context_vals[context] = val
        return True
    #--------------------------------------------------------
    def unset_context(self, context=None):
        """Forget the value stored for <context>; unknown keys are ignored."""
        self._context_vals.pop(context, None)
168 #------------------------------------------------------------ 169 # usable instances 170 #------------------------------------------------------------
class cMatchProvider_FixedList(cMatchProvider):
    """Match provider where all possible options can be held
    in a reasonably sized, pre-allocated list.

    Every match method returns a tuple (found, matches) where matches
    is a new list of item dicts ordered by descending 'weight'.
    """
    def __init__(self, aSeq = None):
        """aSeq must be a list or tuple of dicts. Each dict must have the keys (data, label, weight)

        Raises TypeError if aSeq is neither a list nor a tuple.
        """
        if not isinstance(aSeq, (list, tuple)):
            _log.error('fixed list match provider argument must be a list or tuple of dicts')
            raise TypeError('fixed list match provider argument must be a list or tuple of dicts')

        self.__items = aSeq
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        return self.__result([
            item for item in self.__items
            if item['label'].lower().startswith(aFragment)
        ])
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""
        return self.__result([
            item for item in self.__items
            if self.__starts_a_word(item['label'].lower(), aFragment)
        ])
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        return self.__result([
            item for item in self.__items
            if aFragment in item['label'].lower()
        ])
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items.

        The stored sequence itself is left untouched (it may be a
        tuple, and it belongs to the caller); a sorted copy is returned.
        """
        return self.__result(self.__items)
    #--------------------------------------------------------
    def set_items(self, items):
        """items must be a list of dicts. Each dict must have the keys (data, label, weight)"""
        self.__items = items
    #--------------------------------------------------------
    # helpers
    #--------------------------------------------------------
    @staticmethod
    def __starts_a_word(label, fragment):
        """Tell whether fragment occurs in label at the start of any word.

        Every occurrence is inspected: the first one may sit inside a
        word while a later one starts a word. Both arguments are expected
        to be lower case already so that positions refer to the same string.
        """
        pos = label.find(fragment)
        while pos != -1:
            # at start of phrase, or preceded by a blank
            if pos == 0 or label[pos - 1] == ' ':
                return True
            pos = label.find(fragment, pos + 1)
        return False
    #--------------------------------------------------------
    @staticmethod
    def __result(items):
        """Turn matching items into the (found, matches) result tuple.

        Items are ordered by descending weight; the sort is stable, so
        items of equal weight keep their relative order.
        """
        if len(items) == 0:
            return (False, [])
        ordered = sorted(items, key = lambda item: item['weight'], reverse = True)
        return (True, ordered)
266 # ===========================================================
class cMatchProvider_Func(cMatchProvider):
    """Match provider which searches matches
    in the results of a function call.

    Every match method returns a tuple (found, matches). Matches are
    delivered in the order in which get_candidates() returned them.
    """
    def __init__(self, get_candidates = None):
        """get_candidates() must return a list of dicts, each having a 'label' key.

        Raises ValueError if get_candidates is None.
        """
        if get_candidates is None:
            _log.error('must define function to retrieve match candidates list')
            raise ValueError('must define function to retrieve match candidates list')

        self._get_candidates = get_candidates
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""
        _log.debug('getting phrase matches')
        # the *label* must start with the fragment, not the other way round
        return self.__result([
            candidate for candidate in self._get_candidates()
            if candidate['label'].lower().startswith(aFragment)
        ])
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""
        _log.debug('getting word matches')
        # FIXME: use word seps
        return self.__result([
            candidate for candidate in self._get_candidates()
            if self.__starts_a_word(candidate['label'].lower(), aFragment)
        ])
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""
        return self.__result([
            candidate for candidate in self._get_candidates()
            if aFragment in candidate['label'].lower()
        ])
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all candidates as a (found, matches) tuple.

        This is the same result shape as the other match methods so that
        getMatches(u'*') hands callers a consistent value.
        """
        return self.__result(list(self._get_candidates()))
    #--------------------------------------------------------
    # helpers
    #--------------------------------------------------------
    @staticmethod
    def __starts_a_word(label, fragment):
        """Tell whether fragment occurs in label at the start of any word.

        A fragment which does not occur at all is no match (a "not found"
        position of -1 must never be used as an index). Both arguments are
        expected to be lower case already.
        """
        pos = label.find(fragment)
        while pos != -1:
            # at start of phrase, or preceded by a blank
            if pos == 0 or label[pos - 1] == ' ':
                return True
            pos = label.find(fragment, pos + 1)
        return False
    #--------------------------------------------------------
    @staticmethod
    def __result(matches):
        """Turn matching candidates into the (found, matches) result tuple.

        naive ordering: candidates are not sorted
        FIXME: do ordering
        """
        if len(matches) == 0:
            return (False, [])
        return (True, matches)

# ===========================================================
class cMatchProvider_SQL2(cMatchProvider):
    """Match provider which searches matches
    in possibly several database tables.

    queries:
        - a list of unicode strings
        - each string is a query
        - each string must contain: "... where <column> %(fragment_condition)s ..."
        - each string can contain in the where clause: "... %(<context_key>)s ..."
        - each query must return (data, label)

    context definitions to be used in the queries
    example: {'ctxt_country': {'where_part': 'and country = %(country)s', 'placeholder': 'country'}}

    Every match method returns a tuple (found, matches) where matches is
    a list of dicts with the keys (data, label, weight).
    """
    def __init__(self, queries = None, context = None):
        """Set up the provider.

        queries -- a list of query strings; a tuple is converted into a
                   list, anything else is taken to be one single query
        context -- dict of context definitions (see class docstring)
        """
        if isinstance(queries, tuple):
            queries = list(queries)
        elif not isinstance(queries, list):
            queries = [queries]

        # note: failing queries are removed from this list at runtime
        self._queries = queries

        if context is None:
            self._context = {}
        else:
            self._context = context

        # arguments handed to the database along with each query
        self._args = {}
        cMatchProvider.__init__(self)
    #--------------------------------------------------------
    # internal matching algorithms
    #
    # if we end up here:
    #   - aFragment will not be "None"
    #   - aFragment will be lower case
    #   - we _do_ deliver matches (whether we find any is a different story)
    #--------------------------------------------------------
    def getMatchesByPhrase(self, aFragment):
        """Return matches for aFragment at start of phrases."""

        fragment_condition = u"ILIKE %(fragment)s"
        self._args['fragment'] = u"%s%%" % aFragment

        return self.__find_matches(fragment_condition)
    #--------------------------------------------------------
    def getMatchesByWord(self, aFragment):
        """Return matches for aFragment at start of words inside phrases."""

        fragment_condition = u"~* %(fragment)s"
        # NOTE(review): presumably escapes regex metacharacters - confirm against gmPG2
        aFragment = gmPG2.sanitize_pg_regex(expression = aFragment, escape_all = False)
        # either preceded by a blank or at the very start of the value
        self._args['fragment'] = u"( %s)|(^%s)" % (aFragment, aFragment)

        return self.__find_matches(fragment_condition)
    #--------------------------------------------------------
    def getMatchesBySubstr(self, aFragment):
        """Return matches for aFragment as a true substring."""

        fragment_condition = u"ILIKE %(fragment)s"
        self._args['fragment'] = u"%%%s%%" % aFragment

        return self.__find_matches(fragment_condition)
    #--------------------------------------------------------
    def getAllMatches(self):
        """Return all items."""
        return self.getMatchesBySubstr(u'')
    #--------------------------------------------------------
    def __find_matches(self, fragment_condition):
        """Run the queries in order and return the rows of the first one yielding any.

        fragment_condition is substituted for %(fragment_condition)s in
        each query. The where part of a context definition is substituted
        only if a value has been set for its placeholder, otherwise an
        empty string is used.

        A query which fails is logged and removed from the list of
        queries, and (False, []) is returned for this search.
        """
        for query in self._queries:
            where_fragments = {'fragment_condition': fragment_condition}

            for context_key, context_def in self._context.items():
                try:
                    placeholder = context_def['placeholder']
                    where_part = context_def['where_part']
                    context_val = self._context_vals[placeholder]
                except KeyError:
                    # we don't have a context value for this key, so skip the where condition
                    where_fragments[context_key] = u''
                    continue
                # we do have a context value for this key, so add the where condition
                self._args[placeholder] = context_val
                where_fragments[context_key] = where_part
                if self.print_queries:
                    # single-argument print() behaves the same under Python 2 and 3
                    print(placeholder)
                    print(where_part)
                    print(context_val)

            cmd = query % where_fragments

            if self.print_queries:
                print(self.__class__.__name__)
                print(self._context_vals)
                print(self._args)
                print(cmd)

            try:
                rows, _unused_idx = gmPG2.run_ro_queries(queries = [{'cmd': cmd, 'args': self._args}])
            except Exception:
                # do not catch KeyboardInterrupt/SystemExit along with query errors
                _log.exception('[%s]: error running match provider SQL, dropping query', self.__class__.__name__)
                self._queries.remove(query)
                # the list was modified, so stop iterating over it
                return (False, [])

            # no matches found: try next query
            if len(rows) == 0:
                continue

            matches = [
                {'data': row[0], 'label': row[1], 'weight': 0}
                for row in rows
            ]
            return (True, matches)

        # none found whatsoever
        return (False, [])
466 #================================================================ 467 if __name__ == '__main__': 468 pass 469 470 #================================================================ 471