%21NewsStats/py/Analyse.py

# > !NewsStats.py.Analyse/py

import os
import re
import rfc822
import string
import types
import wimp
import mx.DateTime
import MainW
import Newsbase

# Headers that are never counted: they are (nearly) unique per article, so a
# frequency table of them carries no information.
ignore_headers = ('date',
                  'message-id',
                  'nntp-posting-date',
                  'path',
                  'received',
                  'references',
                  'xref',
                  'x-trace')

# Regular expressions shared by the functions below.
lt = re.compile('<')
gt = re.compile('>')
# A leading "Re: " (any case), stripped so replies count under the original
# subject.
ref = re.compile(r'^Re: ', re.IGNORECASE)
# Dates whose year has only two digits: date1 matches years 80-99 and date2
# matches years 00-79.  Group 1 is everything before the year, group 3 is
# the two-digit year plus its trailing space.
date1 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([89]\d )")
date2 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([0-7]\d )")
# A newline with any following whitespace, or a run of tabs.  Written as a
# raw string so that "\s" reaches the regex engine instead of being an
# invalid string escape.
space = re.compile(r"(\n\s*)|(\t+)")

# Suffix appended to every output file name.
extension = '/html'


def group(group, path, fast):
    """Analyse a group and write its statistics pages.

    group -- name of the newsgroup, passed to Newsbase and shown in the
             status display.
    path  -- output location handed on to end().
    fast  -- when true, only the from/subject/date fields of the article
             list are analysed (items 2, 3 and 5 of each entry) and no
             article file is opened; otherwise every article is fetched
             and all of its headers are processed.

    Returns None on success, 0 when the group has no articles, or the
    string returned by Newsbase.ListArts / the details value returned by a
    failed Newsbase.GetArticle (presumably an error message -- confirm
    against Newsbase).
    """
    global articles
    MainW.set_status(group, '', '0')
    articles = Newsbase.ListArts(group)
    if isinstance(articles, types.StringType):
        # ListArts handed back a string instead of a list: pass it on
        return articles
    if not articles:
        return 0
    total = str(len(articles))
    MainW.set_status(group, '?', total)
    init()
    if fast:
        wimp.hourglass_on()
        try:
            h = {}
            for article in articles:
                h['from'] = article[2]
                h['subject'] = article[3]
                h['date'] = article[5]
                process_headers(h)
            end(group, path, 1)
        finally:
            # never leave the hourglass showing, even if analysis fails
            wimp.hourglass_off()
    else:
        art_no = 0
        for article in articles:
            # keep the desktop responsive while articles are being read
            wimp.handle(wimp.poll(0))
            success, details = Newsbase.GetArticle(group, article[0])
            if not success:
                abort()
                return details
            file(details)
            MainW.set_status(group, str(art_no), total)
            art_no = art_no + 1
        end(group, path)
        wimp.hourglass_off()


def init():
    """Reset the module-level tables used while analysing one group.

    headers -- header name -> {content: count}
    first   -- subject -> earliest date seen
    last    -- subject -> latest date seen
    """
    global headers, first, last
    headers, first, last = {}, {}, {}


def file(file):
    """Load one article file and feed its headers to process_headers().

    file -- path of the article to read.

    The file is closed even when parsing or processing raises.
    """
    f = open(file, 'r')
    try:
        # rfc822.Message reads the header section from the open file
        process_headers(rfc822.Message(f))
    finally:
        f.close()


def process_headers(m):
    """Count the headers of one article and track per-subject date ranges.

    m -- mapping of lower-case header name to content; either an
         rfc822.Message or a plain dictionary (see group()).

    Updates the module-level tables created by init():
    headers[name][content] is incremented for every header not listed in
    ignore_headers, and first/last record the earliest and latest date
    seen for each subject.
    """
    # Increment the header dictionaries
    for (header, content) in m.items():
        if header in ignore_headers:
            continue
        if '\n' in content or '\t' in content:
            # collapse folded header lines and tabs to single spaces
            content = re.sub(space, ' ', content)
        if header == 'subject':
            content = re.sub(ref, '', content)
        elif header == 'lines':
            # The value comes straight from the article, so it must not be
            # passed to eval(): that would execute arbitrary expressions,
            # read "012" as octal and raise on "08".  Values that are not
            # plain integers are skipped, because end() does arithmetic on
            # these keys.
            try:
                content = int(content)
            except ValueError:
                continue
        header_d = headers.get(header)
        if header_d is None:
            header_d = {}
            headers[header] = header_d
        header_d[content] = header_d.get(content, 0) + 1

    # Check dates for durations
    if m.has_key('subject') and m.has_key('date'):
        # Normalise exactly as above so the key matches the one stored in
        # headers['subject']; otherwise end() cannot find the dates of a
        # subject that was folded over several lines.
        subject = m['subject']
        if '\n' in subject or '\t' in subject:
            subject = re.sub(space, ' ', subject)
        subject = re.sub(ref, '', subject)
        # fix century-less dates
        date = re.sub(date2, r"\1 20\3", re.sub(date1, r"\1 19\3", m['date']))
        try:
            time = mx.DateTime.ARPA.ParseDateTime(date)
        except ValueError:
            # malformed dates raise ValueError
            return
        if not first.has_key(subject) or time < first[subject]:
            first[subject] = time
        if not last.has_key(subject) or time > last[subject]:
            last[subject] = time


def _escape_html(text):
    "Escape the characters that are special in HTML text"
    # '&' must go first so the entities produced below are not re-escaped
    text = text.replace('&', '&amp;')
    text = text.replace('<', '&lt;')
    return text.replace('>', '&gt;')


def _write_page(group, name, table, filename):
    "Write the page for one header table to filename"
    p = open(filename, 'w')
    try:
        p.write((wimp.m['output.page.head'] + '\n') % (group, name, group, name))

        rows = table.items()
        if name == 'lines':
            # numeric order of the line counts
            rows.sort()
        else:
            # most frequent (or longest) first
            rows.sort(_valuesort)

        # Width of the widest value.  Taking it from the first row only is
        # wrong for 'lines', which is not sorted by value, and fails for an
        # empty table.
        value_length = 0
        for key, value in rows:
            value_length = max(value_length, len(str(value)))

        lines_previous = None
        for key, value in rows:
            # pad/truncate to 63 characters before escaping so the visible
            # width stays constant
            text = _escape_html((str(key) + ' ' * 64)[:63])
            value_text = str(value).rjust(value_length)
            if name == 'lines':
                # mark a jump in the sequence of line counts
                if lines_previous is not None and lines_previous != key - 1:
                    p.write(wimp.m['output.page.gap'] + '\n')
                lines_previous = key
            if name == 'duration':
                duration = int(value.hours)
                if duration == 0:
                    p.write((wimp.m['output.page.line-0'] + '\n') %
                        (text, value_text))
                else:
                    p.write((wimp.m['output.page.line-d'] + '\n') %
                        (text, value_text, duration))
            else:
                p.write((wimp.m['output.page.line'] + '\n') %
                    (text, value_text, value))
        p.write(wimp.m['output.page.foot'] + '\n')
    finally:
        p.close()


def end(group, dir, fast = 0):
    """Analyse the collected data and write it to file.

    group -- group name, substituted into the page templates.
    dir   -- output directory; created if it does not exist.  File names
             are built as dir + '.' + leaf + extension.
    fast  -- when true, no separator entry is added to the index.

    Writes one index file plus one numbered page per header table, using
    the templates in wimp.m, runs 'SetType ... FAF' on every file written
    (presumably marking it as HTML -- confirm) and finally deletes the
    module-level tables created by init().
    """
    global headers, first, last

    # Calculate durations
    durations = {}
    for subject in headers.get('subject', {}).keys():
        try:
            durations[subject] = last[subject] - first[subject]
        except KeyError:
            # no usable date was seen for this subject
            pass

    # Sort the headers
    headers_list = headers.items()
    if not fast:
        headers_list.append(('=', {}))
    headers_list.append(('duration', durations))
    headers_list.sort(_headersort)

    # Start the index file
    if not os.path.isdir(dir):
        os.makedirs(dir)
    index_name = dir + '.index' + extension
    o = open(index_name, 'w')
    try:
        o.write((wimp.m['output.index.head'] + '\n') % (group, group))

        # Write a file for each header
        header_number = 1
        for name, table in headers_list:
            if name == '=':
                o.write(wimp.m['output.index.separator'] + '\n')
                continue

            o.write((wimp.m['output.index.line'] + '\n') % (str(header_number), name))

            page_name = dir + '.' + str(header_number) + extension
            _write_page(group, name, table, page_name)
            os.system('SetType ' + page_name + ' FAF')

            header_number = header_number + 1

        # Finish the file
        o.write(wimp.m['output.index.foot'] + '\n')
    finally:
        # Close the output file
        o.close()
    os.system('SetType ' + index_name + ' FAF')

    del headers, first, last


def abort():
    """Abandon the current analysis.

    Removes the module-level tables created by init(); raises NameError
    if they do not exist.
    """
    global headers, first, last
    del headers
    del first
    del last


def _headersort(item1, item2):
    "Sort the headers"
    if item1[0] == 'subject':
        return -1
    elif item2[0] == 'subject':
        return 1
    elif item1[0] == 'from':
        return -1
    elif item2[0] == 'from':
        return 1
    elif item1[0] == 'lines':
        return -1
    elif item2[0] == 'lines':
        return 1
    elif item1[0] == 'duration':
        return -1
    elif item2[0] == 'duration':
        return 1
    elif item1[0] == '=':
        return -1
    elif item2[0] == '=':
        return 1
    elif item1[0] < item2[0]:
        return -1
    return 1


def _valuesort(item1, item2):
    "Sort a former dictionary"
    if item1[1] < item2[1]:
        return 1
    elif item1[1] == item2[1]:
        s1 = string.lower(item1[0])
        s2 = string.lower(item2[0])
        if s1 > s2:
            return 1
        elif s1 == s2:
            return 0
    return -1
1	james	17	# > !NewsStats.py.Analyse/py
2
3			import os
4			import re
5			import rfc822
6			import string
7			import types
8			import wimp
9			import mx.DateTime
10			import MainW
11			import Newsbase
12
13			# Don't analyse these headers
14			ignore_headers = ('date',
15			'message-id',
16			'nntp-posting-date',
17			'path',
18			'received',
19			'references',
20			'xref',
21			'x-trace')
22			# Some regular expressions
23			lt = re.compile('<')
24			gt = re.compile('>')
25			ref = re.compile('^Re: ', re.IGNORECASE)
26			date1 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([89]\d )")
27			date2 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([0-7]\d )")
28			space = re.compile("(\n\s*)\|(\t+)")
29
30			extension = '/html'
31
32
33			def group(group, path, fast):
34			"Analyse a group"
35			global articles
36			article_numbers = []
37			MainW.set_status(group, '', '0')
38			articles = Newsbase.ListArts(group)
39			if isinstance(articles, types.StringType):
40			return articles
41			if articles == []:
42			return 0
43			MainW.set_status(group, '?', str(len(articles)))
44			init()
45			if fast:
46			wimp.hourglass_on()
47			h = {}
48			for article in articles:
49			h['from'] = article[2]
50			h['subject'] = article[3]
51			h['date'] = article[5]
52			process_headers(h)
53			end(group, path, 1)
54			else:
55			art_no = 0
56			for article in articles:
57			wimp.handle(wimp.poll(0))
58			success, details = Newsbase.GetArticle(group, article[0])
59			if success:
60			file(details)
61			MainW.set_status(group, str(art_no), str(len(articles)))
62			art_no = art_no + 1
63			else:
64			abort()
65			return details
66			end(group, path)
67			wimp.hourglass_off()
68
69
70			def init():
71			"Get ready to analyse a group"
72			global headers, first, last
73			headers = {}
74			first = {}
75			last = {}
76
77
78			def file(file):
79			"Load an article file"
80
81			# Open the article
82			f = open(file, 'r')
83
84			# Read in the headers
85			m = rfc822.Message(f)
86			process_headers(m)
87
88			# Close file
89			f.close()
90
91
92			def process_headers(m):
93			"Process a dictionary of headers"
94			# Increment the header dictionaries
95			for (header, content) in m.items():
96			if '\n' in content or '\t' in content:
97			content = re.sub(space, ' ', content)
98			if not header in ignore_headers:
99			if headers.has_key(header):
100			header_d = headers[header]
101			else:
102			header_d = {}
103			headers[header] = header_d
104			if header == 'subject':
105			content = re.sub(ref, '', content)
106			elif header == 'lines':
107			content = eval(content)
108			if header_d.has_key(content):
109			header_d[content] = header_d[content] + 1
110			else:
111			header_d[content] = 1
112
113			# Check dates for durations
114			if m.has_key('subject') and m.has_key('date'):
115			subject = re.sub(ref, '', m['subject'])
116			# fix century-less dates
117			date = re.sub(date2, r"\1 20\3", re.sub(date1, r"\1 19\3", m['date']))
118			try:
119			time = mx.DateTime.ARPA.ParseDateTime(date)
120			if first.has_key(subject):
121			if time < first[subject]:
122			first[subject] = time
123			else:
124			first[subject] = time
125			if last.has_key(subject):
126			if time > last[subject]:
127			last[subject] = time
128			else:
129			last[subject] = time
130			except ValueError:
131			# malformed dates raise ValueError
132			pass
133
134
135			def end(group, dir, fast = 0):
136			"Analyse the collected data and write to file"
137			global headers, first, last
138
139			# Calculate durations
140			durations = {}
141			for subject in headers['subject'].items():
142			try:
143			durations[subject[0]] = last[subject[0]] - first[subject[0]]
144			except KeyError:
145			pass
146
147			# Sort the headers
148			headers_list = headers.items()
149			if not fast:
150			headers_list.append(('=', {}))
151			headers_list.append(('duration', durations))
152			headers_list.sort(_headersort)
153
154			# Start the index file
155			os.makedirs(dir)
156			o = open(dir + '.index' + extension, 'w')
157			o.write((wimp.m['output.index.head'] + '\n') % (group, group))
158
159			# Write a file for each header
160			header_number = 1
161			for header in headers_list:
162			if header[0] == '=':
163			o.write(wimp.m['output.index.separator'] + '\n')
164			continue
165
166			o.write((wimp.m['output.index.line'] + '\n') % (str(header_number), header[0]))
167
168			p = open(dir + '.' + str(header_number) + extension, 'w')
169			p.write((wimp.m['output.page.head'] + '\n') % (group, header[0], group, header[0]))
170
171			list = header[1].items()
172			if header[0] == 'lines':
173			list.sort()
174			lines_previous = list[0][0] - 1
175			else:
176			list.sort(_valuesort)
177			value_length = len(str(list[0][1]))
178			for item in list:
179			text = (str(item[0]) + ' ' * 64)[:63]
180			if '<' in text:
181			text = re.sub(lt, '<', text)
182			if '>' in text:
183			text = re.sub(gt, '>', text)
184			if header[0] == 'lines':
185			if lines_previous != item[0] - 1:
186			p.write(wimp.m['output.page.gap'] + '\n')
187			lines_previous = item[0]
188			if header[0] == 'duration':
189			duration = int(item[1].hours)
190			if duration == 0:
191			p.write((wimp.m['output.page.line-0'] + '\n') %
192			(text, (' ' * value_length + str(item[1]))[-value_length:]))
193			else:
194			p.write((wimp.m['output.page.line-d'] + '\n') %
195			(text, (' ' * value_length + str(item[1]))[-value_length:], duration))
196			else:
197			p.write((wimp.m['output.page.line'] + '\n') %
198			(text, (' ' * value_length + str(item[1]))[-value_length:], item[1]))
199			p.write(wimp.m['output.page.foot'] + '\n')
200
201			p.close()
202			os.system('SetType ' + dir + '.' + str(header_number) + extension + ' FAF')
203
204			header_number = header_number + 1
205
206			# Finish the file
207			o.write(wimp.m['output.index.foot'] + '\n')
208
209			# Close the output file
210			o.close()
211			os.system('SetType ' + dir + '.index' + extension + ' FAF')
212
213			del headers, first, last
214
215
216			def abort():
217			"Abort analysis"
218			global headers, first, last
219			del headers, first, last
220
221
222			def _headersort(item1, item2):
223			"Sort the headers"
224			if item1[0] == 'subject':
225			return -1
226			elif item2[0] == 'subject':
227			return 1
228			elif item1[0] == 'from':
229			return -1
230			elif item2[0] == 'from':
231			return 1
232			elif item1[0] == 'lines':
233			return -1
234			elif item2[0] == 'lines':
235			return 1
236			elif item1[0] == 'duration':
237			return -1
238			elif item2[0] == 'duration':
239			return 1
240			elif item1[0] == '=':
241			return -1
242			elif item2[0] == '=':
243			return 1
244			elif item1[0] < item2[0]:
245			return -1
246			return 1
247
248
249			def _valuesort(item1, item2):
250			"Sort a former dictionary"
251			if item1[1] < item2[1]:
252			return 1
253			elif item1[1] == item2[1]:
254			s1 = string.lower(item1[0])
255			s2 = string.lower(item2[0])
256			if s1 > s2:
257			return 1
258			elif s1 == s2:
259			return 0
260			return -1