X-Git-Url: https://git.ozlabs.org/?a=blobdiff_plain;ds=sidebyside;f=apps%2Fpatchwork%2Fbin%2Fparsemail.py;h=68bd94cd37174a4f25851f2a2596cea2e56f34d9;hb=7d9334e879857f8a380bc9509b6cbf9972cecc25;hp=3fbe4735101c31e31a4d5b6c9384e910b062f339;hpb=ae13beca83bcd061d7b0158ed4bdd45abf2eb855;p=patchwork diff --git a/apps/patchwork/bin/parsemail.py b/apps/patchwork/bin/parsemail.py index 3fbe473..68bd94c 100755 --- a/apps/patchwork/bin/parsemail.py +++ b/apps/patchwork/bin/parsemail.py @@ -26,11 +26,11 @@ import time import operator from email import message_from_file try: - from email.header import Header + from email.header import Header, decode_header from email.utils import parsedate_tz, mktime_tz except ImportError: # Python 2.4 compatibility - from email.Header import Header + from email.Header import Header, decode_header from email.Utils import parsedate_tz, mktime_tz from patchwork.parser import parse_patch @@ -38,13 +38,36 @@ from patchwork.models import Patch, Project, Person, Comment list_id_headers = ['List-ID', 'X-Mailing-List'] +whitespace_re = re.compile('\s+') +def normalise_space(str): + return whitespace_re.sub(' ', str).strip() + +def clean_header(header): + """ Decode (possibly non-ascii) headers """ + + def decode(fragment): + (frag_str, frag_encoding) = fragment + if frag_encoding: + return frag_str.decode(frag_encoding) + return frag_str.decode() + + fragments = map(decode, decode_header(header)) + + return normalise_space(u' '.join(fragments)) + def find_project(mail): project = None - listid_re = re.compile('.*<([^>]+)>.*', re.S) + listid_res = [re.compile('.*<([^>]+)>.*', re.S), + re.compile('^([\S]+)$', re.S)] for header in list_id_headers: if header in mail: - match = listid_re.match(mail.get(header)) + + for listid_re in listid_res: + match = listid_re.match(mail.get(header)) + if match: + break + if not match: continue @@ -60,7 +83,7 @@ def find_project(mail): def find_author(mail): - from_header = mail.get('From').strip() + from_header = clean_header(mail.get('From')) (name, email) = (None, None) # tuple of (regex, fn) @@ -93,7 +116,7 @@ def find_author(mail): new_person = False try: - person = Person.objects.get(email = email) + person = Person.objects.get(email__iexact = email) except Person.DoesNotExist: person = Person(name = name, email = email) new_person = True @@ -103,7 +126,6 @@ def find_author(mail): def mail_date(mail): t = parsedate_tz(mail.get('Date', '')) if not t: - print "using now()" return datetime.datetime.utcnow() return datetime.datetime.utcfromtimestamp(mktime_tz(t)) @@ -121,21 +143,21 @@ def find_content(project, mail): if part.get_content_maintype() != 'text': continue - #print "\t%s, %s" % \ - # (part.get_content_subtype(), part.get_content_charset()) - + payload = part.get_payload(decode=True) charset = part.get_content_charset() - if not charset: - charset = mail.get_charset() - if not charset: + subtype = part.get_content_subtype() + + # if we don't have a charset, assume utf-8 + if charset is None: charset = 'utf-8' - payload = unicode(part.get_payload(decode=True), charset, "replace") + if not isinstance(payload, unicode): + payload = unicode(payload, charset) - if part.get_content_subtype() == 'x-patch': + if subtype in ['x-patch', 'x-diff']: patchbuf = payload - if part.get_content_subtype() == 'plain': + elif subtype == 'plain': if not patchbuf: (patchbuf, c) = parse_patch(payload) else: @@ -157,7 +179,7 @@ def find_content(project, mail): if patch: cpatch = patch else: - cpatch = find_patch_for_comment(mail) + cpatch = find_patch_for_comment(project, mail) if not cpatch: return (None, None) comment = Comment(patch = cpatch, date = mail_date(mail), @@ -166,7 +188,7 @@ def find_content(project, mail): return (patch, comment) -def find_patch_for_comment(mail): +def find_patch_for_comment(project, mail): # construct a list of possible reply message ids refs = [] if 'In-Reply-To' in mail: @@ -184,14 +206,14 @@ def find_patch_for_comment(mail): # first, check for a direct reply try: - patch = Patch.objects.get(msgid = ref) + patch = Patch.objects.get(project = project, msgid = ref) return patch except Patch.DoesNotExist: pass # see if we have comments that refer to a patch try: - comment = Comment.objects.get(msgid = ref) + comment = Comment.objects.get(patch__project = project, msgid = ref) return comment.patch except Comment.DoesNotExist: pass @@ -224,7 +246,6 @@ def split_prefixes(prefix): re_re = re.compile('^(re|fwd?)[:\s]\s*', re.I) prefix_re = re.compile('^\[([^\]]*)\]\s*(.*)$') -whitespace_re = re.compile('\s+') def clean_subject(subject, drop_prefixes = None): """ Clean a Subject: header from an incoming patch. @@ -276,8 +297,7 @@ def clean_subject(subject, drop_prefixes = None): # remove Re:, Fwd:, etc subject = re_re.sub(' ', subject) - # normalise whitespace - subject = whitespace_re.sub(' ', subject) + subject = normalise_space(subject) prefixes = [] @@ -291,7 +311,7 @@ def clean_subject(subject, drop_prefixes = None): subject = match.group(2) match = prefix_re.match(subject) - subject = whitespace_re.sub(' ', subject) + subject = normalise_space(subject) subject = subject.strip() if prefixes: @@ -299,13 +319,13 @@ def clean_subject(subject, drop_prefixes = None): return subject -sig_re = re.compile('^(-{2,3} ?|_+)\n.*', re.S | re.M) +sig_re = re.compile('^(-- |_+)\n.*', re.S | re.M) def clean_content(str): + """ Try to remove signature (-- ) and list footer (_____) cruft """ str = sig_re.sub('', str) return str.strip() -def main(args): - mail = message_from_file(sys.stdin) +def parse_mail(mail): # some basic sanity checks if 'From' not in mail: @@ -361,5 +381,9 @@ def main(args): return 0 +def main(args): + mail = message_from_file(sys.stdin) + return parse_mail(mail) + if __name__ == '__main__': sys.exit(main(sys.argv))