# > !NewsStats.py.Analyse/py

import os
import re
import rfc822
import string
import types
import wimp
import mx.DateTime
import MainW
import Newsbase

# Headers that are never analysed (names in lower case)
ignore_headers = ('date',
                  'message-id',
                  'nntp-posting-date',
                  'path',
                  'received',
                  'references',
                  'xref',
                  'x-trace')
# Some regular expressions
# Characters that must be escaped in the HTML output
lt = re.compile('<')
gt = re.compile('>')
# A leading "Re: " on a subject, in any letter case
ref = re.compile('^Re: ', re.IGNORECASE)
# Dates with a two-digit year: date1 matches years 80-99, date2 years 00-79
date1 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([89]\d )")
date2 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([0-7]\d )")
# A newline with any whitespace that follows it, or a run of tabs.
# Raw string: "\s" is not a valid string escape, so the pattern must not
# rely on the string literal leaving the backslash alone.
space = re.compile(r"(\n\s*)|(\t+)")

# Suffix appended to the name of every generated page
extension = '/html'


def group(group, path, fast):
    """Analyse every article of a group and write the report.

    group -- group name, passed to Newsbase and shown in the status display
    path  -- output location handed on to end()
    fast  -- if true, analyse only the from/subject/date fields found in
             the Newsbase.ListArts() result instead of loading each article

    Returns the string given by Newsbase when the article list or an
    article cannot be fetched (presumably an error message), 0 when the
    group has no articles, and None once the report has been written.
    """
    global articles
    MainW.set_status(group, '', '0')
    articles = Newsbase.ListArts(group)
    # A string result is handed straight back to the caller
    if isinstance(articles, types.StringType):
        return articles
    if articles == []:
        return 0
    total = str(len(articles))
    MainW.set_status(group, '?', total)
    init()
    if fast:
        wimp.hourglass_on()
        try:
            h = {}
            for article in articles:
                h['from'] = article[2]
                h['subject'] = article[3]
                h['date'] = article[5]
                process_headers(h)
            end(group, path, 1)
        finally:
            # Never leave the hourglass showing if the analysis fails
            wimp.hourglass_off()
    else:
        art_no = 0
        for article in articles:
            # Keep the desktop responsive while the articles are loaded
            wimp.handle(wimp.poll(0))
            success, details = Newsbase.GetArticle(group, article[0])
            if not success:
                abort()
                return details
            file(details)
            MainW.set_status(group, str(art_no), total)
            art_no = art_no + 1
        end(group, path)
        wimp.hourglass_off()


def init():
    """Reset the module-level tables before a group is analysed.

    headers, first and last each become a fresh, empty dictionary.
    """
    global headers, first, last
    headers, first, last = {}, {}, {}


def file(file):
    """Load an article file and analyse its headers.

    file -- path of the article file to open

    The headers are parsed with rfc822.Message and passed to
    process_headers().
    """
    # Open the article
    f = open(file, 'r')
    try:
        # Read in the headers
        process_headers(rfc822.Message(f))
    finally:
        # Close the file even when parsing or processing fails
        f.close()


def _unfold(text):
    "Collapse folded (multi-line) or tabbed header text onto one line"
    if '\n' in text or '\t' in text:
        return space.sub(' ', text)
    return text


def process_headers(m):
    """Process a dictionary of headers.

    m -- mapping of header name to header text (an rfc822.Message or a
         plain dictionary); names are compared in lower case

    Every header not listed in ignore_headers has its value counted in
    the module-level 'headers' table.  When both a subject and a date
    are present, the earliest and latest posting times of the subject
    are recorded in 'first' and 'last'.
    """
    # Increment the header dictionaries
    for (header, content) in m.items():
        if header in ignore_headers:
            continue
        content = _unfold(content)
        if header == 'subject':
            content = ref.sub('', content)
        elif header == 'lines':
            # The value comes straight from the article, so it is parsed
            # as a number and never eval()ed.  An unparsable count is
            # skipped so that the 'lines' table holds integers only.
            try:
                content = int(content)
            except ValueError:
                continue
        header_d = headers.setdefault(header, {})
        header_d[content] = header_d.get(content, 0) + 1

    # Check dates for durations
    if 'subject' in m and 'date' in m:
        # Same normalisation as the 'subject' counts above, so that the
        # keys of first/last agree with those of headers['subject']
        subject = ref.sub('', _unfold(m['subject']))
        # fix century-less dates
        date = date2.sub(r"\1 20\3", date1.sub(r"\1 19\3", m['date']))
        try:
            time = mx.DateTime.ARPA.ParseDateTime(date)
        except ValueError:
            # malformed dates raise ValueError
            return
        if subject not in first or time < first[subject]:
            first[subject] = time
        if subject not in last or time > last[subject]:
            last[subject] = time


def _write_page(page_name, group, name, counts):
    "Write the page listing every value recorded for one header"
    entries = list(counts.items())
    if name == 'lines':
        # line counts are listed in numerical order
        entries.sort()
    else:
        entries.sort(_valuesort)

    # Pad every value to the width of the widest one.  The first entry
    # is not necessarily the widest: the 'lines' page is ordered by key.
    value_length = 0
    for (key, value) in entries:
        value_length = max(value_length, len(str(value)))

    p = open(page_name, 'w')
    try:
        p.write((wimp.m['output.page.head'] + '\n') % (group, name, group, name))
        lines_previous = None
        for (key, value) in entries:
            # Fixed-width label; truncate before escaping so that the
            # visible text keeps its width
            text = (str(key) + ' ' * 64)[:63]
            # '&' must be escaped first or it would re-escape the entities
            text = text.replace('&', '&amp;')
            text = text.replace('<', '&lt;').replace('>', '&gt;')
            padded = str(value).rjust(value_length)
            if name == 'lines':
                # Mark a jump between non-consecutive line counts
                if lines_previous is not None and lines_previous != key - 1:
                    p.write(wimp.m['output.page.gap'] + '\n')
                lines_previous = key
            if name == 'duration':
                duration = int(value.hours)
                if duration == 0:
                    p.write((wimp.m['output.page.line-0'] + '\n') %
                        (text, padded))
                else:
                    p.write((wimp.m['output.page.line-d'] + '\n') %
                        (text, padded, duration))
            else:
                p.write((wimp.m['output.page.line'] + '\n') %
                    (text, padded, value))
        p.write(wimp.m['output.page.foot'] + '\n')
    finally:
        p.close()


def end(group, dir, fast=0):
    """Analyse the collected data and write it to file.

    group -- group name, substituted into the page templates
    dir   -- directory to create; the index page and one numbered page
             per header are written under this name
    fast  -- if true, no separator entry is added to the index

    Reads the module-level headers/first/last tables and deletes them
    once the report is complete.  os.makedirs() raises OSError when the
    directory cannot be created.
    """
    global headers, first, last

    # Calculate durations
    durations = {}
    for subject in headers.get('subject', {}).keys():
        try:
            durations[subject] = last[subject] - first[subject]
        except KeyError:
            # no first/last time was recorded for this subject
            pass

    # Sort the headers
    headers_list = list(headers.items())
    if not fast:
        headers_list.append(('=', {}))
    headers_list.append(('duration', durations))
    headers_list.sort(_headersort)

    # Start the index file
    os.makedirs(dir)
    index_name = dir + '.index' + extension
    o = open(index_name, 'w')
    try:
        o.write((wimp.m['output.index.head'] + '\n') % (group, group))

        # Write a file for each header
        header_number = 1
        for (name, counts) in headers_list:
            if name == '=':
                o.write(wimp.m['output.index.separator'] + '\n')
                continue

            o.write((wimp.m['output.index.line'] + '\n') % (str(header_number), name))

            page_name = dir + '.' + str(header_number) + extension
            _write_page(page_name, group, name, counts)
            os.system('SetType ' + page_name + ' FAF')

            header_number = header_number + 1

        # Finish the file
        o.write(wimp.m['output.index.foot'] + '\n')
    finally:
        # Close the output file
        o.close()
    os.system('SetType ' + index_name + ' FAF')

    del headers, first, last


def abort():
    """Abandon the analysis in progress.

    Removes the module-level headers, first and last tables.
    """
    global headers
    global first
    global last
    del headers
    del first
    del last


# Position of the headers that are always listed first, in this order
_HEADER_RANKS = {'subject': 0, 'from': 1, 'lines': 2, 'duration': 3, '=': 4}


def _headersort(item1, item2):
    """Sort the headers.

    item1, item2 -- (name, values) pairs; only the name is compared

    'subject', 'from', 'lines', 'duration' and the '=' separator come
    first, in that order; every other header follows alphabetically.
    Returns -1, 0 or 1 like cmp(), with 0 for equal names so that the
    comparison is consistent.
    """
    key1 = (_HEADER_RANKS.get(item1[0], 5), item1[0])
    key2 = (_HEADER_RANKS.get(item2[0], 5), item2[0])
    if key1 < key2:
        return -1
    if key1 > key2:
        return 1
    return 0


def _valuesort(item1, item2):
    """Sort a former dictionary.

    item1, item2 -- (key, value) pairs taken from a dictionary

    Larger values come first; pairs with equal values are ordered by
    key, ignoring letter case.  Returns -1, 0 or 1 like cmp().
    """
    if item1[1] < item2[1]:
        return 1
    if item1[1] != item2[1]:
        return -1
    # String method rather than the deprecated string.lower() function
    s1 = item1[0].lower()
    s2 = item2[0].lower()
    if s1 > s2:
        return 1
    if s1 == s2:
        return 0
    return -1