git.ozlabs.org Git - patchwork/blob - apps/patchwork/bin/parsemail.py

   1 #!/usr/bin/python
   2 #
   3 # Patchwork - automated patch tracking system
   4 # Copyright (C) 2008 Jeremy Kerr <jk@ozlabs.org>
   5 #
   6 # This file is part of the Patchwork package.
   7 #
   8 # Patchwork is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # Patchwork is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with Patchwork; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 import sys
  23 import re
  24 import datetime
  25 import time
  26 import operator
  27 from email import message_from_file
  28 try:
  29     from email.header import Header, decode_header
  30     from email.utils import parsedate_tz, mktime_tz
  31 except ImportError:
  32     # Python 2.4 compatibility
  33     from email.Header import Header, decode_header
  34     from email.Utils import parsedate_tz, mktime_tz
  35
  36 from patchwork.parser import parse_patch
  37 from patchwork.models import Patch, Project, Person, Comment
  38
# Names of the headers that may carry the mailing-list identifier;
# find_project() tries them in this order.
list_id_headers = ['List-ID', 'X-Mailing-List', 'X-list']
  40
  41 whitespace_re = re.compile('\s+')
  42 def normalise_space(str):
  43     return whitespace_re.sub(' ', str).strip()
  44
  45 def clean_header(header):
  46     """ Decode (possibly non-ascii) headers """
  47
  48     def decode(fragment):
  49         (frag_str, frag_encoding) = fragment
  50         if frag_encoding:
  51             return frag_str.decode(frag_encoding)
  52         return frag_str.decode()
  53
  54     fragments = map(decode, decode_header(header))
  55
  56     return normalise_space(u' '.join(fragments))
  57
  58 def find_project(mail):
  59     project = None
  60     listid_res = [re.compile('.*<([^>]+)>.*', re.S),
  61                   re.compile('^([\S]+)$', re.S)]
  62
  63     for header in list_id_headers:
  64         if header in mail:
  65
  66             for listid_re in listid_res:
  67                 match = listid_re.match(mail.get(header))
  68                 if match:
  69                     break
  70
  71             if not match:
  72                 continue
  73
  74             listid = match.group(1)
  75
  76             try:
  77                 project = Project.objects.get(listid = listid)
  78                 break
  79             except:
  80                 pass
  81
  82     return project
  83
  84 def find_author(mail):
  85
  86     from_header = clean_header(mail.get('From'))
  87     (name, email) = (None, None)
  88
  89     # tuple of (regex, fn)
  90     #  - where fn returns a (name, email) tuple from the match groups resulting
  91     #    from re.match().groups()
  92     from_res = [
  93         # for "Firstname Lastname" <example@example.com> style addresses
  94        (re.compile('"?(.*?)"?\s*<([^>]+)>'), (lambda g: (g[0], g[1]))),
  95
  96        # for example@example.com (Firstname Lastname) style addresses
  97        (re.compile('"?(.*?)"?\s*\(([^\)]+)\)'), (lambda g: (g[1], g[0]))),
  98
  99        # everything else
 100        (re.compile('(.*)'), (lambda g: (None, g[0]))),
 101     ]
 102
 103     for regex, fn in from_res:
 104         match = regex.match(from_header)
 105         if match:
 106             (name, email) = fn(match.groups())
 107             break
 108
 109     if email is None:
 110         raise Exception("Could not parse From: header")
 111
 112     email = email.strip()
 113     if name is not None:
 114         name = name.strip()
 115
 116     new_person = False
 117
 118     try:
 119         person = Person.objects.get(email__iexact = email)
 120     except Person.DoesNotExist:
 121         person = Person(name = name, email = email)
 122         new_person = True
 123
 124     return (person, new_person)
 125
 126 def mail_date(mail):
 127     t = parsedate_tz(mail.get('Date', ''))
 128     if not t:
 129         return datetime.datetime.utcnow()
 130     return datetime.datetime.utcfromtimestamp(mktime_tz(t))
 131
 132 def mail_headers(mail):
 133     return reduce(operator.__concat__,
 134             ['%s: %s\n' % (k, Header(v, header_name = k, \
 135                     continuation_ws = '\t').encode()) \
 136                 for (k, v) in mail.items()])
 137
 138 def find_pull_request(content):
 139     git_re = re.compile('^The following changes since commit .*' +
 140                         '^are available in the git repository at:\n'
 141                         '^\s*(git://[^\n]+)$',
 142                            re.DOTALL | re.MULTILINE)
 143     match = git_re.search(content)
 144     if match:
 145         return match.group(1)
 146     return None
 147
 148 def find_content(project, mail):
 149     patchbuf = None
 150     commentbuf = ''
 151     pullurl = None
 152
 153     for part in mail.walk():
 154         if part.get_content_maintype() != 'text':
 155             continue
 156
 157         payload = part.get_payload(decode=True)
 158         charset = part.get_content_charset()
 159         subtype = part.get_content_subtype()
 160
 161         # if we don't have a charset, assume utf-8
 162         if charset is None:
 163             charset = 'utf-8'
 164
 165         if not isinstance(payload, unicode):
 166             payload = unicode(payload, charset)
 167
 168         if subtype in ['x-patch', 'x-diff']:
 169             patchbuf = payload
 170
 171         elif subtype == 'plain':
 172             c = payload
 173
 174             if not patchbuf:
 175                 (patchbuf, c) = parse_patch(payload)
 176
 177             if not pullurl:
 178                 pullurl = find_pull_request(payload)
 179
 180             if c is not None:
 181                 commentbuf += c.strip() + '\n'
 182
 183     patch = None
 184     comment = None
 185
 186     if patchbuf:
 187         mail_headers(mail)
 188         name = clean_subject(mail.get('Subject'), [project.linkname])
 189         patch = Patch(name = name, content = patchbuf,
 190                     date = mail_date(mail), headers = mail_headers(mail))
 191
 192     if pullurl:
 193         name = clean_subject(mail.get('Subject'), [project.linkname])
 194         patch = Patch(name = name, pull_url = pullurl,
 195                     date = mail_date(mail), headers = mail_headers(mail))
 196
 197     if commentbuf:
 198         if patch:
 199             cpatch = patch
 200         else:
 201             cpatch = find_patch_for_comment(project, mail)
 202             if not cpatch:
 203                 return (None, None)
 204         comment = Comment(patch = cpatch, date = mail_date(mail),
 205                 content = clean_content(commentbuf),
 206                 headers = mail_headers(mail))
 207
 208     return (patch, comment)
 209
 210 def find_patch_for_comment(project, mail):
 211     # construct a list of possible reply message ids
 212     refs = []
 213     if 'In-Reply-To' in mail:
 214         refs.append(mail.get('In-Reply-To'))
 215
 216     if 'References' in mail:
 217         rs = mail.get('References').split()
 218         rs.reverse()
 219         for r in rs:
 220             if r not in refs:
 221                 refs.append(r)
 222
 223     for ref in refs:
 224         patch = None
 225
 226         # first, check for a direct reply
 227         try:
 228             patch = Patch.objects.get(project = project, msgid = ref)
 229             return patch
 230         except Patch.DoesNotExist:
 231             pass
 232
 233         # see if we have comments that refer to a patch
 234         try:
 235             comment = Comment.objects.get(patch__project = project, msgid = ref)
 236             return comment.patch
 237         except Comment.DoesNotExist:
 238             pass
 239
 240
 241     return None
 242
 243 split_re = re.compile('[,\s]+')
 244
 245 def split_prefixes(prefix):
 246     """ Turn a prefix string into a list of prefix tokens
 247
 248     >>> split_prefixes('PATCH')
 249     ['PATCH']
 250     >>> split_prefixes('PATCH,RFC')
 251     ['PATCH', 'RFC']
 252     >>> split_prefixes('')
 253     []
 254     >>> split_prefixes('PATCH,')
 255     ['PATCH']
 256     >>> split_prefixes('PATCH ')
 257     ['PATCH']
 258     >>> split_prefixes('PATCH,RFC')
 259     ['PATCH', 'RFC']
 260     >>> split_prefixes('PATCH 1/2')
 261     ['PATCH', '1/2']
 262     """
 263     matches = split_re.split(prefix)
 264     return [ s for s in matches if s != '' ]
 265
 266 re_re = re.compile('^(re|fwd?)[:\s]\s*', re.I)
 267 prefix_re = re.compile('^\[([^\]]*)\]\s*(.*)$')
 268
 269 def clean_subject(subject, drop_prefixes = None):
 270     """ Clean a Subject: header from an incoming patch.
 271
 272     Removes Re: and Fwd: strings, as well as [PATCH]-style prefixes. By
 273     default, only [PATCH] is removed, and we keep any other bracketed data
 274     in the subject. If drop_prefixes is provided, remove those too,
 275     comparing case-insensitively.
 276
 277     >>> clean_subject('meep')
 278     'meep'
 279     >>> clean_subject('Re: meep')
 280     'meep'
 281     >>> clean_subject('[PATCH] meep')
 282     'meep'
 283     >>> clean_subject('[PATCH] meep \\n meep')
 284     'meep meep'
 285     >>> clean_subject('[PATCH RFC] meep')
 286     '[RFC] meep'
 287     >>> clean_subject('[PATCH,RFC] meep')
 288     '[RFC] meep'
 289     >>> clean_subject('[PATCH,1/2] meep')
 290     '[1/2] meep'
 291     >>> clean_subject('[PATCH RFC 1/2] meep')
 292     '[RFC,1/2] meep'
 293     >>> clean_subject('[PATCH] [RFC] meep')
 294     '[RFC] meep'
 295     >>> clean_subject('[PATCH] [RFC,1/2] meep')
 296     '[RFC,1/2] meep'
 297     >>> clean_subject('[PATCH] [RFC] [1/2] meep')
 298     '[RFC,1/2] meep'
 299     >>> clean_subject('[PATCH] rewrite [a-z] regexes')
 300     'rewrite [a-z] regexes'
 301     >>> clean_subject('[PATCH] [RFC] rewrite [a-z] regexes')
 302     '[RFC] rewrite [a-z] regexes'
 303     >>> clean_subject('[foo] [bar] meep', ['foo'])
 304     '[bar] meep'
 305     >>> clean_subject('[FOO] [bar] meep', ['foo'])
 306     '[bar] meep'
 307     """
 308
 309     if drop_prefixes is None:
 310         drop_prefixes = []
 311     else:
 312         drop_prefixes = [ s.lower() for s in drop_prefixes ]
 313
 314     drop_prefixes.append('patch')
 315
 316     # remove Re:, Fwd:, etc
 317     subject = re_re.sub(' ', subject)
 318
 319     subject = normalise_space(subject)
 320
 321     prefixes = []
 322
 323     match = prefix_re.match(subject)
 324
 325     while match:
 326         prefix_str = match.group(1)
 327         prefixes += [ p for p in split_prefixes(prefix_str) \
 328                         if p.lower() not in drop_prefixes]
 329
 330         subject = match.group(2)
 331         match = prefix_re.match(subject)
 332
 333     subject = normalise_space(subject)
 334
 335     subject = subject.strip()
 336     if prefixes:
 337         subject = '[%s] %s' % (','.join(prefixes), subject)
 338
 339     return subject
 340
 341 sig_re = re.compile('^(-- |_+)\n.*', re.S | re.M)
 342 def clean_content(str):
 343     """ Try to remove signature (-- ) and list footer (_____) cruft """
 344     str = sig_re.sub('', str)
 345     return str.strip()
 346
 347 def parse_mail(mail):
 348
 349     # some basic sanity checks
 350     if 'From' not in mail:
 351         return 0
 352
 353     if 'Subject' not in mail:
 354         return 0
 355
 356     if 'Message-Id' not in mail:
 357         return 0
 358
 359     hint = mail.get('X-Patchwork-Hint', '').lower()
 360     if hint == 'ignore':
 361         return 0;
 362
 363     project = find_project(mail)
 364     if project is None:
 365         print "no project found"
 366         return 0
 367
 368     msgid = mail.get('Message-Id').strip()
 369
 370     (author, save_required) = find_author(mail)
 371
 372     (patch, comment) = find_content(project, mail)
 373
 374     if patch:
 375         # we delay the saving until we know we have a patch.
 376         if save_required:
 377             author.save()
 378             save_required = False
 379         patch.submitter = author
 380         patch.msgid = msgid
 381         patch.project = project
 382         try:
 383             patch.save()
 384         except Exception, ex:
 385             print str(ex)
 386
 387     if comment:
 388         if save_required:
 389             author.save()
 390         # looks like the original constructor for Comment takes the pk
 391         # when the Comment is created. reset it here.
 392         if patch:
 393             comment.patch = patch
 394         comment.submitter = author
 395         comment.msgid = msgid
 396         try:
 397             comment.save()
 398         except Exception, ex:
 399             print str(ex)
 400
 401     return 0
 402
 403 def main(args):
 404     mail = message_from_file(sys.stdin)
 405     return parse_mail(mail)
 406
 407 if __name__ == '__main__':
 408     sys.exit(main(sys.argv))