git.ozlabs.org Git - patchwork/blob - apps/patchwork/bin/parsemail.py

   1 #!/usr/bin/python
   2 #
   3 # Patchwork - automated patch tracking system
   4 # Copyright (C) 2008 Jeremy Kerr <jk@ozlabs.org>
   5 #
   6 # This file is part of the Patchwork package.
   7 #
   8 # Patchwork is free software; you can redistribute it and/or modify
   9 # it under the terms of the GNU General Public License as published by
  10 # the Free Software Foundation; either version 2 of the License, or
  11 # (at your option) any later version.
  12 #
  13 # Patchwork is distributed in the hope that it will be useful,
  14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 # GNU General Public License for more details.
  17 #
  18 # You should have received a copy of the GNU General Public License
  19 # along with Patchwork; if not, write to the Free Software
  20 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  21
  22 import sys
  23 import re
  24 import datetime
  25 import time
  26 import operator
  27 from email import message_from_file
  28 try:
  29     from email.header import Header
  30     from email.utils import parsedate_tz, mktime_tz
  31 except ImportError:
  32     # Python 2.4 compatibility
  33     from email.Header import Header
  34     from email.Utils import parsedate_tz, mktime_tz
  35
  36 from patchwork.parser import parse_patch
  37 from patchwork.models import Patch, Project, Person, Comment
  38
  39 list_id_headers = ['List-ID', 'X-Mailing-List']
  40
  41 def find_project(mail):
  42     project = None
  43     listid_re = re.compile('.*<([^>]+)>.*', re.S)
  44
  45     for header in list_id_headers:
  46         if header in mail:
  47             match = listid_re.match(mail.get(header))
  48             if not match:
  49                 continue
  50
  51             listid = match.group(1)
  52
  53             try:
  54                 project = Project.objects.get(listid = listid)
  55                 break
  56             except:
  57                 pass
  58
  59     return project
  60
  61 def find_author(mail):
  62
  63     from_header = mail.get('From').strip()
  64     (name, email) = (None, None)
  65
  66     # tuple of (regex, fn)
  67     #  - where fn returns a (name, email) tuple from the match groups resulting
  68     #    from re.match().groups()
  69     from_res = [
  70         # for "Firstname Lastname" <example@example.com> style addresses
  71        (re.compile('"?(.*?)"?\s*<([^>]+)>'), (lambda g: (g[0], g[1]))),
  72
  73        # for example@example.com (Firstname Lastname) style addresses
  74        (re.compile('"?(.*?)"?\s*\(([^\)]+)\)'), (lambda g: (g[1], g[0]))),
  75
  76        # everything else
  77        (re.compile('(.*)'), (lambda g: (None, g[0]))),
  78     ]
  79
  80     for regex, fn in from_res:
  81         match = regex.match(from_header)
  82         if match:
  83             (name, email) = fn(match.groups())
  84             break
  85
  86     if email is None:
  87         raise Exception("Could not parse From: header")
  88
  89     email = email.strip()
  90     if name is not None:
  91         name = name.strip()
  92
  93     new_person = False
  94
  95     try:
  96         person = Person.objects.get(email = email)
  97     except Person.DoesNotExist:
  98         person = Person(name = name, email = email)
  99         new_person = True
 100
 101     return (person, new_person)
 102
 103 def mail_date(mail):
 104     t = parsedate_tz(mail.get('Date', ''))
 105     if not t:
 106         return datetime.datetime.utcnow()
 107     return datetime.datetime.utcfromtimestamp(mktime_tz(t))
 108
 109 def mail_headers(mail):
 110     return reduce(operator.__concat__,
 111             ['%s: %s\n' % (k, Header(v, header_name = k, \
 112                     continuation_ws = '\t').encode()) \
 113                 for (k, v) in mail.items()])
 114
 115 def find_content(project, mail):
 116     patchbuf = None
 117     commentbuf = ''
 118
 119     for part in mail.walk():
 120         if part.get_content_maintype() != 'text':
 121             continue
 122
 123         #print "\t%s, %s" % \
 124         #    (part.get_content_subtype(), part.get_content_charset())
 125
 126         charset = part.get_content_charset()
 127         if not charset:
 128             charset = mail.get_charset()
 129         if not charset:
 130             charset = 'utf-8'
 131
 132         payload = unicode(part.get_payload(decode=True), charset, "replace")
 133
 134         if part.get_content_subtype() == 'x-patch':
 135             patchbuf = payload
 136
 137         if part.get_content_subtype() == 'plain':
 138             if not patchbuf:
 139                 (patchbuf, c) = parse_patch(payload)
 140             else:
 141                 c = payload
 142
 143             if c is not None:
 144                 commentbuf += c.strip() + '\n'
 145
 146     patch = None
 147     comment = None
 148
 149     if patchbuf:
 150         mail_headers(mail)
 151         name = clean_subject(mail.get('Subject'), [project.linkname])
 152         patch = Patch(name = name, content = patchbuf,
 153                     date = mail_date(mail), headers = mail_headers(mail))
 154
 155     if commentbuf:
 156         if patch:
 157             cpatch = patch
 158         else:
 159             cpatch = find_patch_for_comment(mail)
 160             if not cpatch:
 161                 return (None, None)
 162         comment = Comment(patch = cpatch, date = mail_date(mail),
 163                 content = clean_content(commentbuf),
 164                 headers = mail_headers(mail))
 165
 166     return (patch, comment)
 167
 168 def find_patch_for_comment(mail):
 169     # construct a list of possible reply message ids
 170     refs = []
 171     if 'In-Reply-To' in mail:
 172         refs.append(mail.get('In-Reply-To'))
 173
 174     if 'References' in mail:
 175         rs = mail.get('References').split()
 176         rs.reverse()
 177         for r in rs:
 178             if r not in refs:
 179                 refs.append(r)
 180
 181     for ref in refs:
 182         patch = None
 183
 184         # first, check for a direct reply
 185         try:
 186             patch = Patch.objects.get(msgid = ref)
 187             return patch
 188         except Patch.DoesNotExist:
 189             pass
 190
 191         # see if we have comments that refer to a patch
 192         try:
 193             comment = Comment.objects.get(msgid = ref)
 194             return comment.patch
 195         except Comment.DoesNotExist:
 196             pass
 197
 198
 199     return None
 200
 201 split_re = re.compile('[,\s]+')
 202
 203 def split_prefixes(prefix):
 204     """ Turn a prefix string into a list of prefix tokens
 205
 206     >>> split_prefixes('PATCH')
 207     ['PATCH']
 208     >>> split_prefixes('PATCH,RFC')
 209     ['PATCH', 'RFC']
 210     >>> split_prefixes('')
 211     []
 212     >>> split_prefixes('PATCH,')
 213     ['PATCH']
 214     >>> split_prefixes('PATCH ')
 215     ['PATCH']
 216     >>> split_prefixes('PATCH,RFC')
 217     ['PATCH', 'RFC']
 218     >>> split_prefixes('PATCH 1/2')
 219     ['PATCH', '1/2']
 220     """
 221     matches = split_re.split(prefix)
 222     return [ s for s in matches if s != '' ]
 223
 224 re_re = re.compile('^(re|fwd?)[:\s]\s*', re.I)
 225 prefix_re = re.compile('^\[([^\]]*)\]\s*(.*)$')
 226 whitespace_re = re.compile('\s+')
 227
 228 def clean_subject(subject, drop_prefixes = None):
 229     """ Clean a Subject: header from an incoming patch.
 230
 231     Removes Re: and Fwd: strings, as well as [PATCH]-style prefixes. By
 232     default, only [PATCH] is removed, and we keep any other bracketed data
 233     in the subject. If drop_prefixes is provided, remove those too,
 234     comparing case-insensitively.
 235
 236     >>> clean_subject('meep')
 237     'meep'
 238     >>> clean_subject('Re: meep')
 239     'meep'
 240     >>> clean_subject('[PATCH] meep')
 241     'meep'
 242     >>> clean_subject('[PATCH] meep \\n meep')
 243     'meep meep'
 244     >>> clean_subject('[PATCH RFC] meep')
 245     '[RFC] meep'
 246     >>> clean_subject('[PATCH,RFC] meep')
 247     '[RFC] meep'
 248     >>> clean_subject('[PATCH,1/2] meep')
 249     '[1/2] meep'
 250     >>> clean_subject('[PATCH RFC 1/2] meep')
 251     '[RFC,1/2] meep'
 252     >>> clean_subject('[PATCH] [RFC] meep')
 253     '[RFC] meep'
 254     >>> clean_subject('[PATCH] [RFC,1/2] meep')
 255     '[RFC,1/2] meep'
 256     >>> clean_subject('[PATCH] [RFC] [1/2] meep')
 257     '[RFC,1/2] meep'
 258     >>> clean_subject('[PATCH] rewrite [a-z] regexes')
 259     'rewrite [a-z] regexes'
 260     >>> clean_subject('[PATCH] [RFC] rewrite [a-z] regexes')
 261     '[RFC] rewrite [a-z] regexes'
 262     >>> clean_subject('[foo] [bar] meep', ['foo'])
 263     '[bar] meep'
 264     >>> clean_subject('[FOO] [bar] meep', ['foo'])
 265     '[bar] meep'
 266     """
 267
 268     if drop_prefixes is None:
 269         drop_prefixes = []
 270     else:
 271         drop_prefixes = [ s.lower() for s in drop_prefixes ]
 272
 273     drop_prefixes.append('patch')
 274
 275     # remove Re:, Fwd:, etc
 276     subject = re_re.sub(' ', subject)
 277
 278     # normalise whitespace
 279     subject = whitespace_re.sub(' ', subject)
 280
 281     prefixes = []
 282
 283     match = prefix_re.match(subject)
 284
 285     while match:
 286         prefix_str = match.group(1)
 287         prefixes += [ p for p in split_prefixes(prefix_str) \
 288                         if p.lower() not in drop_prefixes]
 289
 290         subject = match.group(2)
 291         match = prefix_re.match(subject)
 292
 293     subject = whitespace_re.sub(' ', subject)
 294
 295     subject = subject.strip()
 296     if prefixes:
 297         subject = '[%s] %s' % (','.join(prefixes), subject)
 298
 299     return subject
 300
 301 sig_re = re.compile('^(-{2,3} ?|_+)\n.*', re.S | re.M)
 302 def clean_content(str):
 303     str = sig_re.sub('', str)
 304     return str.strip()
 305
 306 def main(args):
 307     mail = message_from_file(sys.stdin)
 308
 309     # some basic sanity checks
 310     if 'From' not in mail:
 311         return 0
 312
 313     if 'Subject' not in mail:
 314         return 0
 315
 316     if 'Message-Id' not in mail:
 317         return 0
 318
 319     hint = mail.get('X-Patchwork-Hint', '').lower()
 320     if hint == 'ignore':
 321         return 0;
 322
 323     project = find_project(mail)
 324     if project is None:
 325         print "no project found"
 326         return 0
 327
 328     msgid = mail.get('Message-Id').strip()
 329
 330     (author, save_required) = find_author(mail)
 331
 332     (patch, comment) = find_content(project, mail)
 333
 334     if patch:
 335         # we delay the saving until we know we have a patch.
 336         if save_required:
 337             author.save()
 338             save_required = False
 339         patch.submitter = author
 340         patch.msgid = msgid
 341         patch.project = project
 342         try:
 343             patch.save()
 344         except Exception, ex:
 345             print str(ex)
 346
 347     if comment:
 348         if save_required:
 349             author.save()
 350         # looks like the original constructor for Comment takes the pk
 351         # when the Comment is created. reset it here.
 352         if patch:
 353             comment.patch = patch
 354         comment.submitter = author
 355         comment.msgid = msgid
 356         try:
 357             comment.save()
 358         except Exception, ex:
 359             print str(ex)
 360
 361     return 0
 362
 363 if __name__ == '__main__':
 364     sys.exit(main(sys.argv))