#!/usr/bin/env python
#
# Simple program to test our ability to parse an IMAP SEARCH into a Python
# expression
#
import re
import string

############################################################################
#
# In order to parse strings we need a spiffy regular expression. Strings are
# one of three different kinds of things: an unadorned atom, a quoted
# string, or a literal, which is a length-prefixed item: {<n>}CRLF<bytes>
# where there are <n> bytes.
#
# NOTE: DEL is spelled \x7f here.  The spelling "\0177" is read by the re
# module as "\017" followed by a literal "7", which wrongly excludes the
# digit 7 from atoms.
#
_token = re.compile(r"""
    (?P<atom>    [^(){\x20\x00-\x1f\x7f%*"\\]+ )        |
    (?P<literal> \{\d+\}\r\n )                           |
    (?P<quoted>  " [^"\\\n]* (?: \\. [^"\\\n]* )* " )
""", re.VERBOSE).match

(T_ATOM,      # 0
 T_LITERAL,   # 1
 T_QUOTED,    # 2
 T_EOF,       # 3
 ) = range(4)


############################################################################
#
#
class IMAPSearchError(ValueError):
    """Raised when an IMAP SEARCH string cannot be tokenized or parsed."""


############################################################################
#
#
def _unquote(quoted):
    """Strip the surrounding quotes of an IMAP quoted string and resolve
    its backslash escapes (a backslash quotes the character after it)."""
    return re.sub(r'\\(.)', r'\1', quoted[1:-1])


############################################################################
#
#
def parse_astring(str):
    '''Parse an IMAP "astring" off the front of the given string.

    An astring is either an IMAP protocol atom or an IMAP protocol string.
    An IMAP protocol string is either a "quoted" or a "literal." Here is
    the BNF from the rfc:

    astring         ::= atom / string
    atom            ::= 1*ATOM_CHAR
    ATOM_CHAR       ::= <any CHAR except atom_specials>
    atom_specials   ::= "(" / ")" / "{" / SPACE / CTL / list_wildcards /
                        quoted_specials
    CTL             ::= <any ASCII control character and DEL,
                         0x00 - 0x1f, 0x7f>
    list_wildcards  ::= "%" / "*"
    literal         ::= "{" number "}" CRLF *CHAR8
                        ;; Number represents the number of CHAR8 octets
    quoted          ::= <"> *QUOTED_CHAR <">
    QUOTED_CHAR     ::= <any TEXT_CHAR except quoted_specials> /
                        "\\" quoted_specials
    quoted_specials ::= <"> / "\\"
    string          ::= quoted / literal

    We do all the real magic in a regexp. We only expect one string so we
    do one match and find out what kind of thing it is.

    The parameter keeps its historical name ``str`` (shadowing the builtin
    inside this function) so that existing keyword callers keep working.

    Returns a tuple (<value>, <remainder>) where <value> is the decoded
    string and <remainder> is everything after it.  Basically we parse our
    input by pulling bits off of the front; this pulls off a string bit.

    Raises IMAPSearchError if the input does not start with an astring or
    if a literal announces more octets than are available.
    '''
    m = _token(str)
    if not m:
        raise IMAPSearchError("Yow! Unable to tokenize!")

    group = m.lastgroup
    val = m.group(group)
    remainder = m.end()

    if group == 'atom':
        return (val, str[remainder:])

    if group == 'quoted':
        # Undo the IMAP quoting by hand rather than eval()ing text that
        # came from the client: eval would apply Python escape rules.
        return (_unquote(val), str[remainder:])

    if group == 'literal':
        # val is "{<n>}\r\n": drop the leading "{" and the trailing "}\r\n".
        count = int(val[1:-3])
        data = str[remainder:remainder + count]
        if len(data) < count:
            raise IMAPSearchError(
                "Literal announced %d octets but only %d are available"
                % (count, len(data)))
        return (data, str[remainder + count:])

    raise IMAPSearchError("group was not matched: %s" % group)


############################################################################
#
# Search keys that only test for the presence or absence of a system flag:
# (keyword, flag, True if the flag must be present).
#
_FLAG_KEYS = (
    ("ANSWERED", "\\Answered", True),
    ("DELETED", "\\Deleted", True),
    ("DRAFT", "\\Draft", True),
    ("FLAGGED", "\\Flagged", True),
    ("RECENT", "\\Recent", True),
    ("SEEN", "\\Seen", True),
    ("UNANSWERED", "\\Answered", False),
    ("UNDELETED", "\\Deleted", False),
    ("UNDRAFT", "\\Draft", False),
    ("UNFLAGGED", "\\Flagged", False),
    ("UNSEEN", "\\Seen", False),
)

# Search keys that do a case-insensitive substring match on one header.
_HEADER_KEYS = ("BCC", "CC", "FROM", "SUBJECT", "TO")

# Search keys that are recognized but not implemented yet:
# (prefix, placeholder expression).  Only the prefix is consumed; the
# key's argument is left in the remainder.
# TODO: several placeholders are not valid Python, so IMAPSearch fails to
# compile a search that uses these keys.
_UNIMPLEMENTED_KEYS = (
    ("BEFORE ", "BEFORE NotImplemented"),
    ("BODY ", "BODY NotImplemented"),
    ("KEYWORD ", "NotImplemented"),
    ("ON ", "NotImplemented"),
    ("SINCE ", "NotImplemented"),
    ("TEXT ", "NotImplemented"),
    ("UNKEYWORD ", "NotImplemented"),
    ("HEADER ", "NotImplemented"),
    ("LARGER ", "NotImplemented"),
    ("SENTBEFORE ", "NotImplemented"),
    ("SENTON ", "NotImplemented"),
    ("SENTSINCE ", "NotImplemented"),
    ("SMALLER ", "NotImplemented"),
)


def _flag_test(flag, present):
    """Return the Python source that tests db_entry['flags'] for a flag."""
    if present:
        return "%r in db_entry['flags']" % flag
    return "%r not in db_entry['flags']" % flag


def _header_test(header, needle):
    """Return the Python source for a case-insensitive substring match of
    needle against message[header].

    The needle is embedded with repr() so that no character in it can
    terminate the string literal and inject code into the expression.
    """
    return ("(%r in message and message[%r].lower().find(%r) != -1)"
            % (header, header, needle.lower()))


def _after_sequence_set(text):
    """Return what follows the space-delimited token at the front of text
    (starting at the space), or '' if the token runs to the end."""
    seq_end = text.find(' ')
    if seq_end == -1:
        return ''
    return text[seq_end:]


def _starts_sequence_set(text):
    """Tell whether text is non-empty and begins like a message set."""
    return bool(text) and (text[0].isdigit() or text[0] == '*')


############################################################################
#
#
def parse(search_expr):
    """Parse the first search key off the front of an IMAP search string.

    This helper function is what does the meat of our parsing. It expects
    an understandable token that may have optional key words. A token is
    like 'ALL' or 'BCC' or a sequence like '1,2,4:7,9,10:*'.

    This routine is given a string which has several search expressions in
    it. We parse off the first valid expression and return a tuple. The
    tuple is comprised of the Python code equivalent of the first valid
    expression and the remainder of the string that still needs to be
    parsed, if any.

    We expect our caller to AND together all of the Python expression
    fragments we generate and then compile this into a code object that
    can be invoked later at will against messages to see if we have a
    match.  The generated fragments refer to the names 'message' and
    'db_entry'.

    Raises IMAPSearchError if the string does not start with a known
    search key or is otherwise malformed.
    """
    if not search_expr:
        raise IMAPSearchError("Unknown syntax: %s" % search_expr)

    if search_expr.startswith("ALL"):
        return ("True", search_expr[3:])

    for keyword, flag, present in _FLAG_KEYS:
        if search_expr.startswith(keyword):
            return (_flag_test(flag, present), search_expr[len(keyword):])

    # RFC 2060: NEW is equivalent to "(RECENT UNSEEN)" and OLD to
    # "NOT RECENT" (as opposed to "NOT NEW").
    if search_expr.startswith("NEW"):
        return ("(%s and %s)" % (_flag_test("\\Recent", True),
                                 _flag_test("\\Seen", False)),
                search_expr[3:])
    if search_expr.startswith("OLD"):
        return (_flag_test("\\Recent", False), search_expr[3:])

    for keyword in _HEADER_KEYS:
        prefix = keyword + " "
        if search_expr.startswith(prefix):
            (astring, rest) = parse_astring(search_expr[len(prefix):])
            return (_header_test(keyword.lower(), astring), rest)

    for prefix, placeholder in _UNIMPLEMENTED_KEYS:
        if search_expr.startswith(prefix):
            return (placeholder, search_expr[len(prefix):])

    if search_expr.startswith("NOT "):
        (parsed, rest) = parse(search_expr[4:])
        return ("(not %s)" % parsed, rest)

    if search_expr.startswith("OR "):
        (first_half, rest) = parse(search_expr[3:])
        # The two operands of OR must be separated by exactly one space.
        if not rest or not rest[0].isspace():
            raise IMAPSearchError("Parse error: Expected ' ': %s" % rest)
        (second_half, rest) = parse(rest[1:])
        return ("((%s) or (%s))" % (first_half, second_half), rest)

    if search_expr.startswith("UID "):
        uid_set = search_expr[4:]
        if not _starts_sequence_set(uid_set):
            raise IMAPSearchError(
                "Syntax error: invalid uid set: %s" % uid_set)
        return ("UID set NotImplemented", _after_sequence_set(uid_set))

    if _starts_sequence_set(search_expr):
        return ("set NotImplemented", _after_sequence_set(search_expr))

    if search_expr.startswith("("):
        return ("sub-expressions NotImplemented", '')

    raise IMAPSearchError("Unknown syntax: %s" % search_expr)


############################################################################
#
#
class IMAPSearch(object):
    """This is an IMAPSearch object. It can instantiate all the possible
    criteria for a search of the messages in a mailbox. The possible
    search parameters are as defined in rfc2060.

    What this object does is take a provided search string (that we have
    gotten from a client) and convert it into a Python expression that,
    when applied in an execution environment that has various variables
    representing a message, can return True or False depending on whether
    that message matches the given search criteria.
    """

    #########################################################################
    #
    #
    def __init__(self, search_string):
        """Set up the IMAPSearch object.

        Stores the initial search string we were given, parses it into a
        Python expression (the individual search keys are AND'ed together)
        and compiles that expression into a code object so that it can be
        executed efficiently as part of the searching process.

        Sets self.search_string, self.python_search_expr and
        self.search_code_object.  Progress is printed to stdout.

        Raises IMAPSearchError on malformed input.  A search that uses a
        key that is not implemented yet may raise SyntaxError from
        compile(), because its placeholder is not a valid expression.
        """
        self.python_search_expr = ""
        self.search_string = search_string

        if not search_string:
            raise IMAPSearchError("Bad syntax - empty search string")

        parsed = ""
        while len(search_string) > 0:
            (search_key, search_string) = parse(search_string)
            print("After parse: (%s, %s)" % (search_key, search_string))

            if not search_string:
                # That was the last search key.
                parsed += search_key
                break

            # Something follows: it must be a single space and then at
            # least one more character (the next search key).
            if len(search_string) < 2:
                raise IMAPSearchError(
                    "Bad syntax - trailing space not allowed")
            if not search_string[0].isspace():
                raise IMAPSearchError(
                    "Bad syntax expected SPACE character in search "
                    "string at: \"%s\"" % search_string)

            parsed += "%s and " % search_key
            print("trailing space.. AND'ing: %s" % parsed)
            search_string = search_string[1:]

        self.python_search_expr = parsed
        print("The resulting python expression is: \"%s\"" % parsed)
        self.search_code_object = compile(parsed, '<string>', 'eval')

    #########################################################################
    #
    #
    def match(self, message, db_entry, uid, sequence_number):
        """Apply the search criteria expressed in the creation of this
        IMAPSearch against the given message.

        A complete message consists of the actual message from the MH
        folder (that we parse into an email object), the database entry
        that keeps track of this message's flags and other relevant
        fields, its uid (without the mailbox uid-vv), and the sequence
        number of this message in the current mailbox.

        Returns the value of the compiled search expression: True if the
        message matches, False otherwise.
        """
        # The code object is built from expressions generated by parse();
        # client-supplied text only appears in it as repr()'d literals.
        namespace = {
            'message': message,
            'db_entry': db_entry,
            'uid': uid,
            'sequence_number': sequence_number,
        }
        return eval(self.search_code_object, {}, namespace)