# > !NewsStats.py.Analyse/py

"""Collect statistics on the headers of the articles in a newsgroup.

For every header that is not in ``ignore_headers`` the number of
occurrences of each distinct value is counted.  In addition the time
between the first and the last article of each subject is recorded as
its "duration".  The results are written as one index file plus one
page per header; the text of the pages comes from the message
templates in ``wimp.m``.
"""

import os
import re
import rfc822
import string
import types

import wimp
import mx.DateTime

import MainW
import Newsbase

# Don't analyse these headers
ignore_headers = ('date', 'message-id', 'nntp-posting-date', 'path',
                  'received', 'references', 'xref', 'x-trace')

# Some regular expressions
lt = re.compile('<')
gt = re.compile('>')
_amp = re.compile('&')
ref = re.compile('^Re: ', re.IGNORECASE)
# Dates with a two digit year: 8x/9x are taken as 19xx, 0x-7x as 20xx
date1 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([89]\d )")
date2 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([0-7]\d )")
# Folded header lines and runs of tabs
space = re.compile("(\n\s*)|(\t+)")

extension = '/html'

# Headers that are listed first, in this order, by _headersort
_HEADER_PRIORITY = ('subject', 'from', 'lines', 'duration', '=')


def group(group, path, fast):
    """Analyse a group.

    group -- name of the group, as understood by Newsbase
    path  -- directory the output files are written to (see end())
    fast  -- if true, only the from, subject and date fields of the
             article list are used; otherwise every article file is
             fetched and all of its headers are read

    Returns the result of Newsbase.ListArts unchanged if that is a
    string (presumably an error message - confirm against Newsbase),
    0 if the group has no articles, the details returned by
    Newsbase.GetArticle if fetching an article fails, and None once
    the output has been written.
    """
    global articles

    MainW.set_status(group, '', '0')
    articles = Newsbase.ListArts(group)
    if isinstance(articles, types.StringType):
        return articles
    if articles == []:
        return 0
    total = str(len(articles))
    MainW.set_status(group, '?', total)
    init()
    if fast:
        wimp.hourglass_on()
        h = {}
        for article in articles:
            h['from'] = article[2]
            h['subject'] = article[3]
            h['date'] = article[5]
            process_headers(h)
        end(group, path, 1)
    else:
        art_no = 0
        for article in articles:
            # Keep the desktop responsive while the articles are read
            wimp.handle(wimp.poll(0))
            success, details = Newsbase.GetArticle(group, article[0])
            if not success:
                abort()
                return details
            # On success the details are the name of the article file
            file(details)
            MainW.set_status(group, str(art_no), total)
            art_no = art_no + 1
        end(group, path)
    wimp.hourglass_off()


def init():
    """Get ready to analyse a group.

    Resets the module level tables: 'headers' maps a header name to a
    dictionary of value -> count, 'first' and 'last' map a subject to
    the earliest and latest date seen for it.
    """
    global headers, first, last
    headers = {}
    first = {}
    last = {}


def file(file):
    """Load an article file and process its headers.

    file -- name of the file holding the article
    """
    f = open(file, 'r')
    try:
        # Only the headers are read; the body is not needed
        process_headers(rfc822.Message(f))
    finally:
        # Close the file even if the headers could not be processed
        f.close()


def _clean_subject(subject):
    "Collapse folded whitespace in a subject and strip a leading 'Re: '"
    return re.sub(ref, '', re.sub(space, ' ', subject))


def _parse_line_count(content):
    """Return the value of a Lines header as an integer, or None.

    The value comes straight from an article and must not be trusted,
    so it is converted with int() and never evaluated as an
    expression.
    """
    try:
        return int(content)
    except ValueError:
        return None


def process_headers(m):
    """Process a dictionary of headers.

    m -- a dictionary-like object (a plain dictionary or an
         rfc822.Message) mapping lower case header names to values

    Updates the tables set up by init().
    """
    # Increment the header dictionaries
    for (header, content) in m.items():
        if header in ignore_headers:
            continue
        if '\n' in content or '\t' in content:
            content = re.sub(space, ' ', content)
        if header == 'subject':
            content = re.sub(ref, '', content)
        elif header == 'lines':
            content = _parse_line_count(content)
            if content is None:
                # A malformed line count can't be sorted numerically
                continue
        if headers.has_key(header):
            header_d = headers[header]
        else:
            header_d = {}
            headers[header] = header_d
        header_d[content] = header_d.get(content, 0) + 1

    # Check dates for durations
    if m.has_key('subject') and m.has_key('date'):
        # Use the same key as the subject counts above, so that end()
        # can match them up
        subject = _clean_subject(m['subject'])
        # fix century-less dates
        date = re.sub(date2, r"\1 20\3",
                      re.sub(date1, r"\1 19\3", m['date']))
        try:
            time = mx.DateTime.ARPA.ParseDateTime(date)
        except ValueError:
            # malformed dates raise ValueError; such articles are
            # simply left out of the durations
            return
        if not first.has_key(subject) or time < first[subject]:
            first[subject] = time
        if not last.has_key(subject) or time > last[subject]:
            last[subject] = time


def _escape_html(text):
    "Escape the characters that are special in HTML"
    # The ampersand goes first, or the entities below get escaped again
    text = re.sub(_amp, '&amp;', text)
    text = re.sub(lt, '&lt;', text)
    return re.sub(gt, '&gt;', text)


def _write_page(path, group, name, counts):
    """Write the page for one header.

    path   -- name of the file to write
    group  -- name of the group, for the page head
    name   -- name of the header
    counts -- dictionary of value -> count (or subject -> duration)
    """
    entries = counts.items()
    if name == 'lines':
        # Line counts are listed in numerical order
        entries.sort()
    else:
        entries.sort(_valuesort)

    # All values are right justified to the width of the widest one
    value_length = 0
    for entry in entries:
        value_length = max(value_length, len(str(entry[1])))

    p = open(path, 'w')
    try:
        p.write((wimp.m['output.page.head'] + '\n')
                % (group, name, group, name))
        if name == 'lines' and entries:
            lines_previous = entries[0][0] - 1
        for item in entries:
            # Pad or cut the text to 63 characters before escaping, so
            # that the visible columns line up
            text = _escape_html((str(item[0]) + ' ' * 64)[:63])
            value = (' ' * value_length + str(item[1]))[-value_length:]
            if name == 'lines':
                # Mark a jump in the sequence of line counts
                if lines_previous != item[0] - 1:
                    p.write(wimp.m['output.page.gap'] + '\n')
                lines_previous = item[0]
            if name == 'duration':
                duration = int(item[1].hours)
                if duration == 0:
                    p.write((wimp.m['output.page.line-0'] + '\n')
                            % (text, value))
                else:
                    p.write((wimp.m['output.page.line-d'] + '\n')
                            % (text, value, duration))
            else:
                p.write((wimp.m['output.page.line'] + '\n')
                        % (text, value, item[1]))
        p.write(wimp.m['output.page.foot'] + '\n')
    finally:
        p.close()


def end(group, dir, fast = 0):
    """Analyse the collected data and write to file.

    group -- name of the group, for the headings
    dir   -- directory to create and write the files into
    fast  -- if true, no separator is put in the index

    Writes '<dir>.index' and one numbered file per header, all with
    the extension given by 'extension', runs 'SetType <file> FAF' on
    each of them, and finally deletes the tables set up by init().
    """
    global headers, first, last

    # Calculate durations
    durations = {}
    for subject in headers.get('subject', {}).keys():
        if first.has_key(subject) and last.has_key(subject):
            durations[subject] = last[subject] - first[subject]

    # Sort the headers
    headers_list = headers.items()
    if not fast:
        # '=' stands for the separator line in the index
        headers_list.append(('=', {}))
    headers_list.append(('duration', durations))
    headers_list.sort(_headersort)

    # Start the index file
    os.makedirs(dir)
    index_path = dir + '.index' + extension
    o = open(index_path, 'w')
    try:
        o.write((wimp.m['output.index.head'] + '\n') % (group, group))

        # Write a file for each header
        header_number = 1
        for header in headers_list:
            if header[0] == '=':
                o.write(wimp.m['output.index.separator'] + '\n')
            else:
                o.write((wimp.m['output.index.line'] + '\n')
                        % (str(header_number), header[0]))
                page_path = dir + '.' + str(header_number) + extension
                _write_page(page_path, group, header[0], header[1])
                os.system('SetType ' + page_path + ' FAF')
                header_number = header_number + 1

        # Finish the file
        o.write(wimp.m['output.index.foot'] + '\n')
    finally:
        # Close the output file
        o.close()
    os.system('SetType ' + index_path + ' FAF')

    del headers, first, last


def abort():
    """Abort analysis.

    Deletes the tables set up by init(); they must exist.
    """
    global headers, first, last
    del headers, first, last


def _headersort(item1, item2):
    """Sort the headers.

    Comparison function for (name, dictionary) pairs: the names in
    _HEADER_PRIORITY come first, in that order, and all other names
    follow in ascending order.
    """
    for name in _HEADER_PRIORITY:
        if item1[0] == name:
            return -1
        if item2[0] == name:
            return 1
    if item1[0] < item2[0]:
        return -1
    return 1


def _valuesort(item1, item2):
    """Sort a former dictionary.

    Comparison function for (key, value) pairs: the highest value
    comes first; equal values are ordered by key, ignoring case.
    """
    if item1[1] < item2[1]:
        return 1
    if item1[1] == item2[1]:
        s1 = string.lower(item1[0])
        s2 = string.lower(item2[0])
        if s1 > s2:
            return 1
        if s1 == s2:
            return 0
    return -1