X-Git-Url: http://git.ozlabs.org/?a=blobdiff_plain;f=apps%2Fpatchwork%2Fbin%2Fparsemail.py;h=19e6e57d5214fa309f5f27068df726c6f3696dfa;hb=b08229de74367b95f851f7bb2c89781022df4101;hp=d0993acfc2f641a790c08f35917cbb669111351b;hpb=81c5a915f210b70451c970d223430804d501ddc3;p=patchwork diff --git a/apps/patchwork/bin/parsemail.py b/apps/patchwork/bin/parsemail.py index d0993ac..19e6e57 100755 --- a/apps/patchwork/bin/parsemail.py +++ b/apps/patchwork/bin/parsemail.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # # Patchwork - automated patch tracking system # Copyright (C) 2008 Jeremy Kerr @@ -24,22 +24,53 @@ import re import datetime import time import operator +import codecs from email import message_from_file -from email.header import Header -from email.utils import parsedate_tz, mktime_tz +try: + from email.header import Header, decode_header + from email.utils import parsedate_tz, mktime_tz +except ImportError: + # Python 2.4 compatibility + from email.Header import Header, decode_header + from email.Utils import parsedate_tz, mktime_tz -from patchparser import parse_patch -from patchwork.models import Patch, Project, Person, Comment +from patchwork.parser import parse_patch +from patchwork.models import Patch, Project, Person, Comment, State, \ + get_default_initial_patch_state +from django.contrib.auth.models import User -list_id_headers = ['List-ID', 'X-Mailing-List'] +list_id_headers = ['List-ID', 'X-Mailing-List', 'X-list'] + +whitespace_re = re.compile('\s+') +def normalise_space(str): + return whitespace_re.sub(' ', str).strip() + +def clean_header(header): + """ Decode (possibly non-ascii) headers """ + + def decode(fragment): + (frag_str, frag_encoding) = fragment + if frag_encoding: + return frag_str.decode(frag_encoding) + return frag_str.decode() + + fragments = map(decode, decode_header(header)) + + return normalise_space(u' '.join(fragments)) def find_project(mail): project = None - listid_re = re.compile('.*<([^>]+)>.*', re.S) + listid_res = [re.compile('.*<([^>]+)>.*', re.S), + re.compile('^([\S]+)$', re.S)] for header in list_id_headers: if header in mail: - match = listid_re.match(mail.get(header)) + + for listid_re in listid_res: + match = listid_re.match(mail.get(header)) + if match: + break + if not match: continue @@ -55,7 +86,7 @@ def find_project(mail): def find_author(mail): - from_header = mail.get('From').strip() + from_header = clean_header(mail.get('From')) (name, email) = (None, None) # tuple of (regex, fn) @@ -88,7 +119,7 @@ def find_author(mail): new_person = False try: - person = Person.objects.get(email = email) + person = Person.objects.get(email__iexact = email) except Person.DoesNotExist: person = Person(name = name, email = email) new_person = True @@ -98,7 +129,6 @@ def find_author(mail): def mail_date(mail): t = parsedate_tz(mail.get('Date', '')) if not t: - print "using now()" return datetime.datetime.utcnow() return datetime.datetime.utcfromtimestamp(mktime_tz(t)) @@ -108,33 +138,74 @@ def mail_headers(mail): continuation_ws = '\t').encode()) \ for (k, v) in mail.items()]) +def find_pull_request(content): + git_re = re.compile('^The following changes since commit.*' + + '^are available in the git repository at:\n' + '^\s*([\S]+://[^\n]+)$', + re.DOTALL | re.MULTILINE) + match = git_re.search(content) + if match: + return match.group(1) + return None + +def try_decode(payload, charset): + try: + payload = unicode(payload, charset) + except UnicodeDecodeError: + return None + return payload + def find_content(project, mail): patchbuf = None commentbuf = '' + pullurl = None for part in mail.walk(): if part.get_content_maintype() != 'text': continue - #print "\t%s, %s" % \ - # (part.get_content_subtype(), part.get_content_charset()) + payload = part.get_payload(decode=True) + subtype = part.get_content_subtype() + + if not isinstance(payload, unicode): + charset = part.get_content_charset() + + # Check that we have a charset that we understand. Otherwise, + # ignore it and fallback to our standard set. + if charset is not None: + try: + codec = codecs.lookup(charset) + except LookupError: + charset = None - charset = part.get_content_charset() - if not charset: - charset = mail.get_charset() - if not charset: - charset = 'utf-8' + # If there is no charset or if it is unknown, then try some common + # charsets before we fail. + if charset is None: + try_charsets = ['utf-8', 'windows-1252', 'iso-8859-1'] + else: + try_charsets = [charset] + + for cset in try_charsets: + decoded_payload = try_decode(payload, cset) + if decoded_payload is not None: + break + payload = decoded_payload - payload = unicode(part.get_payload(decode=True), charset, "replace") + # Could not find a valid decoded payload. Fail. + if payload is None: + return (None, None) - if part.get_content_subtype() == 'x-patch': + if subtype in ['x-patch', 'x-diff']: patchbuf = payload - if part.get_content_subtype() == 'plain': + elif subtype == 'plain': + c = payload + if not patchbuf: (patchbuf, c) = parse_patch(payload) - else: - c = payload + + if not pullurl: + pullurl = find_pull_request(payload) if c is not None: commentbuf += c.strip() + '\n' @@ -142,17 +213,16 @@ def find_content(project, mail): patch = None comment = None - if patchbuf: - mail_headers(mail) - patch = Patch(name = clean_subject(mail.get('Subject')), - content = patchbuf, date = mail_date(mail), - headers = mail_headers(mail)) + if pullurl or patchbuf: + name = clean_subject(mail.get('Subject'), [project.linkname]) + patch = Patch(name = name, pull_url = pullurl, content = patchbuf, + date = mail_date(mail), headers = mail_headers(mail)) if commentbuf: if patch: - cpatch = patch - else: - cpatch = find_patch_for_comment(mail) + cpatch = patch + else: + cpatch = find_patch_for_comment(project, mail) if not cpatch: return (None, None) comment = Comment(patch = cpatch, date = mail_date(mail), @@ -161,7 +231,7 @@ def find_content(project, mail): return (patch, comment) -def find_patch_for_comment(mail): +def find_patch_for_comment(project, mail): # construct a list of possible reply message ids refs = [] if 'In-Reply-To' in mail: @@ -179,14 +249,14 @@ def find_patch_for_comment(mail): # first, check for a direct reply try: - patch = Patch.objects.get(msgid = ref) + patch = Patch.objects.get(project = project, msgid = ref) return patch except Patch.DoesNotExist: pass # see if we have comments that refer to a patch try: - comment = Comment.objects.get(msgid = ref) + comment = Comment.objects.get(patch__project = project, msgid = ref) return comment.patch except Comment.DoesNotExist: pass @@ -194,23 +264,131 @@ def find_patch_for_comment(mail): return None -re_re = re.compile('^(re|fwd?)[:\s]\s*', re.I) -prefix_re = re.compile('^\[.*\]\s*') -whitespace_re = re.compile('\s+') +split_re = re.compile('[,\s]+') + +def split_prefixes(prefix): + """ Turn a prefix string into a list of prefix tokens + + >>> split_prefixes('PATCH') + ['PATCH'] + >>> split_prefixes('PATCH,RFC') + ['PATCH', 'RFC'] + >>> split_prefixes('') + [] + >>> split_prefixes('PATCH,') + ['PATCH'] + >>> split_prefixes('PATCH ') + ['PATCH'] + >>> split_prefixes('PATCH,RFC') + ['PATCH', 'RFC'] + >>> split_prefixes('PATCH 1/2') + ['PATCH', '1/2'] + """ + matches = split_re.split(prefix) + return [ s for s in matches if s != '' ] -def clean_subject(subject): +re_re = re.compile('^(re|fwd?)[:\s]\s*', re.I) +prefix_re = re.compile('^\[([^\]]*)\]\s*(.*)$') + +def clean_subject(subject, drop_prefixes = None): + """ Clean a Subject: header from an incoming patch. + + Removes Re: and Fwd: strings, as well as [PATCH]-style prefixes. By + default, only [PATCH] is removed, and we keep any other bracketed data + in the subject. If drop_prefixes is provided, remove those too, + comparing case-insensitively. + + >>> clean_subject('meep') + 'meep' + >>> clean_subject('Re: meep') + 'meep' + >>> clean_subject('[PATCH] meep') + 'meep' + >>> clean_subject('[PATCH] meep \\n meep') + 'meep meep' + >>> clean_subject('[PATCH RFC] meep') + '[RFC] meep' + >>> clean_subject('[PATCH,RFC] meep') + '[RFC] meep' + >>> clean_subject('[PATCH,1/2] meep') + '[1/2] meep' + >>> clean_subject('[PATCH RFC 1/2] meep') + '[RFC,1/2] meep' + >>> clean_subject('[PATCH] [RFC] meep') + '[RFC] meep' + >>> clean_subject('[PATCH] [RFC,1/2] meep') + '[RFC,1/2] meep' + >>> clean_subject('[PATCH] [RFC] [1/2] meep') + '[RFC,1/2] meep' + >>> clean_subject('[PATCH] rewrite [a-z] regexes') + 'rewrite [a-z] regexes' + >>> clean_subject('[PATCH] [RFC] rewrite [a-z] regexes') + '[RFC] rewrite [a-z] regexes' + >>> clean_subject('[foo] [bar] meep', ['foo']) + '[bar] meep' + >>> clean_subject('[FOO] [bar] meep', ['foo']) + '[bar] meep' + """ + + subject = clean_header(subject) + + if drop_prefixes is None: + drop_prefixes = [] + else: + drop_prefixes = [ s.lower() for s in drop_prefixes ] + + drop_prefixes.append('patch') + + # remove Re:, Fwd:, etc subject = re_re.sub(' ', subject) - subject = prefix_re.sub('', subject) - subject = whitespace_re.sub(' ', subject) - return subject.strip() -sig_re = re.compile('^(-{2,3} ?|_+)\n.*', re.S | re.M) + subject = normalise_space(subject) + + prefixes = [] + + match = prefix_re.match(subject) + + while match: + prefix_str = match.group(1) + prefixes += [ p for p in split_prefixes(prefix_str) \ + if p.lower() not in drop_prefixes] + + subject = match.group(2) + match = prefix_re.match(subject) + + subject = normalise_space(subject) + + subject = subject.strip() + if prefixes: + subject = '[%s] %s' % (','.join(prefixes), subject) + + return subject + +sig_re = re.compile('^(-- |_+)\n.*', re.S | re.M) def clean_content(str): + """ Try to remove signature (-- ) and list footer (_____) cruft """ str = sig_re.sub('', str) return str.strip() -def main(args): - mail = message_from_file(sys.stdin) +def get_state(state_name): + """ Return the state with the given name or the default State """ + if state_name: + try: + return State.objects.get(name__iexact=state_name) + except State.DoesNotExist: + pass + return get_default_initial_patch_state() + +def get_delegate(delegate_email): + """ Return the delegate with the given email or None """ + if delegate_email: + try: + return User.objects.get(email__iexact=delegate_email) + except User.DoesNotExist: + pass + return None + +def parse_mail(mail): # some basic sanity checks if 'From' not in mail: @@ -245,10 +423,13 @@ def main(args): patch.submitter = author patch.msgid = msgid patch.project = project + patch.state = get_state(mail.get('X-Patchwork-State', '').strip()) + patch.delegate = get_delegate( + mail.get('X-Patchwork-Delegate', '').strip()) try: patch.save() except Exception, ex: - print ex.message + print str(ex) if comment: if save_required: @@ -262,9 +443,13 @@ def main(args): try: comment.save() except Exception, ex: - print ex.message + print str(ex) return 0 +def main(args): + mail = message_from_file(sys.stdin) + return parse_mail(mail) + if __name__ == '__main__': sys.exit(main(sys.argv))