# !NewsStats/py/Analyse.py

# > !NewsStats.py.Analyse/py

import os
import re
import rfc822
import string
import types
import wimp
import mx.DateTime
import MainW
import Newsbase

# Don't analyse these headers
# (process_headers() skips every header whose name appears in this tuple)
ignore_headers = ('date',
                  'message-id',
                  'nntp-posting-date',
                  'path',
                  'received',
                  'references',
                  'xref',
                  'x-trace')
# Some regular expressions
# A literal '<' and a literal '>'
lt = re.compile('<')
gt = re.compile('>')
# A leading "Re: " reply prefix, matched case-insensitively
ref = re.compile('^Re: ', re.IGNORECASE)
# Dates starting "[Www,] D[D] Mmm YY " with a two-digit year.  date1 matches
# years 80-99 and date2 matches years 00-79; process_headers() uses them to
# prefix the year with "19" and "20" respectively.
date1 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([89]\d )")
date2 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([0-7]\d )")
# A newline together with any whitespace that follows it, or a run of tabs;
# process_headers() replaces each match with a single space
space = re.compile("(\n\s*)|(\t+)")

# Suffix appended to the name of every output file written by end()
extension = '/html'


def group(group, path, fast):
    """Analyse a group and write the result pages under path.

    group -- name of the newsgroup, passed to Newsbase and MainW
    path  -- output location handed on to end()
    fast  -- true: use only the from/subject/date fields of the article
             list returned by Newsbase.ListArts(); false: fetch and parse
             every article file.

    Returns the string produced by Newsbase when listing or fetching fails,
    0 when the group contains no articles, and None on success.
    """
    global articles
    MainW.set_status(group, '', '0')
    articles = Newsbase.ListArts(group)
    # ListArts() reports a failure by returning a string instead of a list
    if isinstance(articles, types.StringType):
        return articles
    if articles == []:
        return 0
    MainW.set_status(group, '?', str(len(articles)))
    init()
    if fast:
        wimp.hourglass_on()
        try:
            # One dictionary is reused for every article; process_headers()
            # only reads it.
            h = {}
            for article in articles:
                h['from'] = article[2]
                h['subject'] = article[3]
                h['date'] = article[5]
                process_headers(h)
            end(group, path, 1)
        finally:
            # Never leave the hourglass showing if the analysis fails
            wimp.hourglass_off()
    else:
        art_no = 0
        for article in articles:
            # Keep the desktop responsive while the articles are read
            wimp.handle(wimp.poll(0))
            success, details = Newsbase.GetArticle(group, article[0])
            if success:
                # On success details is the name of the article file
                file(details)
                MainW.set_status(group, str(art_no), str(len(articles)))
                art_no = art_no + 1
            else:
                # On failure details is the error to hand back to the caller
                abort()
                return details
        end(group, path)
        # NOTE(review): the hourglass is never turned on in this branch; the
        # call is kept because the original made it - confirm it is harmless.
        wimp.hourglass_off()


def init():
    """Reset the module-level accumulators before a group is analysed.

    headers, first and last are each rebound to a fresh, empty dictionary.
    """
    global headers, first, last
    headers, first, last = {}, {}, {}


def file(file):
    """Load an article file and add its headers to the running totals.

    file -- name of the article file to open

    The headers are parsed with rfc822.Message and passed to
    process_headers().  The file is closed even when parsing or processing
    raises, so a bad article cannot leak an open file handle.
    """
    f = open(file, 'r')
    try:
        process_headers(rfc822.Message(f))
    finally:
        f.close()


def _unfold(content):
    "Replace folded line breaks and runs of tabs in a header value by one space"
    if '\n' in content or '\t' in content:
        content = re.sub(space, ' ', content)
    return content


def process_headers(m):
    """Process a dictionary of headers.

    m -- mapping of header name to value: an rfc822.Message (see file()) or
         a plain dictionary (see group()).

    Every header not named in ignore_headers is counted in the module-level
    headers dictionary (header name -> {value -> number of articles}).
    When both a subject and a date are present, the earliest and latest
    time seen for that subject are recorded in first and last.
    """
    # Increment the header dictionaries
    for (header, content) in m.items():
        if header in ignore_headers:
            continue
        content = _unfold(content)
        if header == 'subject':
            content = re.sub(ref, '', content)
        elif header == 'lines':
            # Article headers are untrusted input, so the count is parsed
            # with int() rather than eval().  A value that is not a plain
            # integer is left out of the statistics.
            try:
                content = int(content)
            except ValueError:
                continue
        header_d = headers.setdefault(header, {})
        header_d[content] = header_d.get(content, 0) + 1

    # Check dates for durations
    if 'subject' in m and 'date' in m:
        # Use exactly the key the subject was counted under above, so that
        # end() can match counts and times for folded subjects as well.
        subject = re.sub(ref, '', _unfold(m['subject']))
        # fix century-less dates
        date = re.sub(date2, r"\1 20\3", re.sub(date1, r"\1 19\3", m['date']))
        try:
            time = mx.DateTime.ARPA.ParseDateTime(date)
        except ValueError:
            # malformed dates raise ValueError
            return
        if subject not in first or time < first[subject]:
            first[subject] = time
        if subject not in last or time > last[subject]:
            last[subject] = time


def _html_escape(text):
    "Escape '&', '<' and '>' so that text can be embedded in an HTML page"
    # '&' goes first, otherwise the entities inserted below would be escaped
    text = text.replace('&', '&amp;')
    text = text.replace('<', '&lt;')
    return text.replace('>', '&gt;')


def _write_page(p, group, name, counts):
    "Write the page for one header (name, with its counts) to the open file p"
    p.write((wimp.m['output.page.head'] + '\n') % (group, name, group, name))

    rows = counts.items()
    if name == 'lines':
        # Ascending by number of lines
        rows.sort()
    else:
        # Largest value first, ties broken by name
        rows.sort(_valuesort)

    # Right-align every value to the width of the widest one.  The first row
    # is not always the widest (the 'lines' page is sorted by key), and the
    # table may be empty.
    value_length = 0
    for row in rows:
        value_length = max(value_length, len(str(row[1])))

    lines_previous = None
    for key, value in rows:
        # Pad or cut the label to 63 characters before escaping it
        text = _html_escape((str(key) + ' ' * 64)[:63])
        padded = str(value).rjust(value_length)
        if name == 'lines':
            # Mark a jump between non-consecutive line counts
            if lines_previous is not None and lines_previous != key - 1:
                p.write(wimp.m['output.page.gap'] + '\n')
            lines_previous = key
        if name == 'duration':
            duration = int(value.hours)
            if duration == 0:
                p.write((wimp.m['output.page.line-0'] + '\n') %
                    (text, padded))
            else:
                p.write((wimp.m['output.page.line-d'] + '\n') %
                    (text, padded, duration))
        else:
            p.write((wimp.m['output.page.line'] + '\n') %
                (text, padded, value))
    p.write(wimp.m['output.page.foot'] + '\n')


def end(group, dir, fast = 0):
    """Analyse the collected data and write to file.

    group -- name of the newsgroup, inserted into the page templates
    dir   -- directory to create; output names are built as
             dir + '.' + name + extension
    fast  -- when false a separator entry is added to the index

    Writes an index file plus one numbered page per collected header and
    one for the per-subject durations, using the templates in wimp.m.  On
    success the module-level headers, first and last are deleted.
    """
    global headers, first, last

    # Calculate durations
    durations = {}
    for subject in headers.get('subject', {}).keys():
        try:
            durations[subject] = last[subject] - first[subject]
        except KeyError:
            # No usable date was seen for this subject
            pass

    # Sort the headers
    headers_list = headers.items()
    if not fast:
        headers_list.append(('=', {}))
    headers_list.append(('duration', durations))
    headers_list.sort(_headersort)

    # Start the index file
    # NOTE(review): makedirs raises OSError when dir already exists;
    # presumably the caller always supplies a fresh location - confirm.
    os.makedirs(dir)
    index_name = dir + '.index' + extension
    o = open(index_name, 'w')
    try:
        o.write((wimp.m['output.index.head'] + '\n') % (group, group))

        # Write a file for each header
        header_number = 1
        for name, counts in headers_list:
            if name == '=':
                # The separator entry has no page of its own
                o.write(wimp.m['output.index.separator'] + '\n')
                continue

            o.write((wimp.m['output.index.line'] + '\n') % (str(header_number), name))

            page_name = dir + '.' + str(header_number) + extension
            p = open(page_name, 'w')
            try:
                _write_page(p, group, name, counts)
            finally:
                p.close()
            os.system('SetType ' + page_name + ' FAF')

            header_number = header_number + 1

        # Finish the file
        o.write(wimp.m['output.index.foot'] + '\n')
    finally:
        # Close the output file even when a page could not be written
        o.close()
    os.system('SetType ' + index_name + ' FAF')

    del headers, first, last


def abort():
    """Abort analysis.

    Discards the module-level headers, first and last dictionaries.  Names
    that are already gone are skipped, so calling this twice, or after
    end() has cleaned up, no longer raises NameError.
    """
    module_state = globals()
    for name in ('headers', 'first', 'last'):
        if name in module_state:
            del module_state[name]


def _headersort(item1, item2):
    """Sort the headers.

    cmp-style comparison of two (name, data) pairs.  'subject', 'from',
    'lines', 'duration' and '=' come first, in that order; all other names
    follow in ascending order.  The result is -1 or 1, never 0.
    """
    name1 = item1[0]
    name2 = item2[0]
    # The first fixed name carried by either item decides the order; the
    # left-hand item is tested first, so it wins when both carry the name.
    for fixed in ('subject', 'from', 'lines', 'duration', '='):
        if name1 == fixed:
            return -1
        if name2 == fixed:
            return 1
    if name1 < name2:
        return -1
    return 1


def _valuesort(item1, item2):
    """Sort a former dictionary.

    cmp-style comparison of two (name, value) pairs: the larger value comes
    first; equal values are ordered by name, ignoring case.  Returns -1, 0
    or 1.
    """
    if item1[1] < item2[1]:
        return 1
    elif item1[1] == item2[1]:
        # String methods replace the deprecated string.lower() function
        s1 = item1[0].lower()
        s2 = item2[0].lower()
        if s1 > s2:
            return 1
        elif s1 == s2:
            return 0
    return -1
1	# > !NewsStats.py.Analyse/py
2
3	import os
4	import re
5	import rfc822
6	import string
7	import types
8	import wimp
9	import mx.DateTime
10	import MainW
11	import Newsbase
12
13	# Don't analyse these headers
14	ignore_headers = ('date',
15	'message-id',
16	'nntp-posting-date',
17	'path',
18	'received',
19	'references',
20	'xref',
21	'x-trace')
22	# Some regular expressions
23	lt = re.compile('<')
24	gt = re.compile('>')
25	ref = re.compile('^Re: ', re.IGNORECASE)
26	date1 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([89]\d )")
27	date2 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([0-7]\d )")
28	space = re.compile("(\n\s*)\|(\t+)")
29
30	extension = '/html'
31
32
33	def group(group, path, fast):
34	"Analyse a group"
35	global articles
36	article_numbers = []
37	MainW.set_status(group, '', '0')
38	articles = Newsbase.ListArts(group)
39	if isinstance(articles, types.StringType):
40	return articles
41	if articles == []:
42	return 0
43	MainW.set_status(group, '?', str(len(articles)))
44	init()
45	if fast:
46	wimp.hourglass_on()
47	h = {}
48	for article in articles:
49	h['from'] = article[2]
50	h['subject'] = article[3]
51	h['date'] = article[5]
52	process_headers(h)
53	end(group, path, 1)
54	else:
55	art_no = 0
56	for article in articles:
57	wimp.handle(wimp.poll(0))
58	success, details = Newsbase.GetArticle(group, article[0])
59	if success:
60	file(details)
61	MainW.set_status(group, str(art_no), str(len(articles)))
62	art_no = art_no + 1
63	else:
64	abort()
65	return details
66	end(group, path)
67	wimp.hourglass_off()
68
69
70	def init():
71	"Get ready to analyse a group"
72	global headers, first, last
73	headers = {}
74	first = {}
75	last = {}
76
77
78	def file(file):
79	"Load an article file"
80
81	# Open the article
82	f = open(file, 'r')
83
84	# Read in the headers
85	m = rfc822.Message(f)
86	process_headers(m)
87
88	# Close file
89	f.close()
90
91
92	def process_headers(m):
93	"Process a dictionary of headers"
94	# Increment the header dictionaries
95	for (header, content) in m.items():
96	if '\n' in content or '\t' in content:
97	content = re.sub(space, ' ', content)
98	if not header in ignore_headers:
99	if headers.has_key(header):
100	header_d = headers[header]
101	else:
102	header_d = {}
103	headers[header] = header_d
104	if header == 'subject':
105	content = re.sub(ref, '', content)
106	elif header == 'lines':
107	content = eval(content)
108	if header_d.has_key(content):
109	header_d[content] = header_d[content] + 1
110	else:
111	header_d[content] = 1
112
113	# Check dates for durations
114	if m.has_key('subject') and m.has_key('date'):
115	subject = re.sub(ref, '', m['subject'])
116	# fix century-less dates
117	date = re.sub(date2, r"\1 20\3", re.sub(date1, r"\1 19\3", m['date']))
118	try:
119	time = mx.DateTime.ARPA.ParseDateTime(date)
120	if first.has_key(subject):
121	if time < first[subject]:
122	first[subject] = time
123	else:
124	first[subject] = time
125	if last.has_key(subject):
126	if time > last[subject]:
127	last[subject] = time
128	else:
129	last[subject] = time
130	except ValueError:
131	# malformed dates raise ValueError
132	pass
133
134
135	def end(group, dir, fast = 0):
136	"Analyse the collected data and write to file"
137	global headers, first, last
138
139	# Calculate durations
140	durations = {}
141	for subject in headers['subject'].items():
142	try:
143	durations[subject[0]] = last[subject[0]] - first[subject[0]]
144	except KeyError:
145	pass
146
147	# Sort the headers
148	headers_list = headers.items()
149	if not fast:
150	headers_list.append(('=', {}))
151	headers_list.append(('duration', durations))
152	headers_list.sort(_headersort)
153
154	# Start the index file
155	os.makedirs(dir)
156	o = open(dir + '.index' + extension, 'w')
157	o.write((wimp.m['output.index.head'] + '\n') % (group, group))
158
159	# Write a file for each header
160	header_number = 1
161	for header in headers_list:
162	if header[0] == '=':
163	o.write(wimp.m['output.index.separator'] + '\n')
164	continue
165
166	o.write((wimp.m['output.index.line'] + '\n') % (str(header_number), header[0]))
167
168	p = open(dir + '.' + str(header_number) + extension, 'w')
169	p.write((wimp.m['output.page.head'] + '\n') % (group, header[0], group, header[0]))
170
171	list = header[1].items()
172	if header[0] == 'lines':
173	list.sort()
174	lines_previous = list[0][0] - 1
175	else:
176	list.sort(_valuesort)
177	value_length = len(str(list[0][1]))
178	for item in list:
179	text = (str(item[0]) + ' ' * 64)[:63]
180	if '<' in text:
181	text = re.sub(lt, '<', text)
182	if '>' in text:
183	text = re.sub(gt, '>', text)
184	if header[0] == 'lines':
185	if lines_previous != item[0] - 1:
186	p.write(wimp.m['output.page.gap'] + '\n')
187	lines_previous = item[0]
188	if header[0] == 'duration':
189	duration = int(item[1].hours)
190	if duration == 0:
191	p.write((wimp.m['output.page.line-0'] + '\n') %
192	(text, (' ' * value_length + str(item[1]))[-value_length:]))
193	else:
194	p.write((wimp.m['output.page.line-d'] + '\n') %
195	(text, (' ' * value_length + str(item[1]))[-value_length:], duration))
196	else:
197	p.write((wimp.m['output.page.line'] + '\n') %
198	(text, (' ' * value_length + str(item[1]))[-value_length:], item[1]))
199	p.write(wimp.m['output.page.foot'] + '\n')
200
201	p.close()
202	os.system('SetType ' + dir + '.' + str(header_number) + extension + ' FAF')
203
204	header_number = header_number + 1
205
206	# Finish the file
207	o.write(wimp.m['output.index.foot'] + '\n')
208
209	# Close the output file
210	o.close()
211	os.system('SetType ' + dir + '.index' + extension + ' FAF')
212
213	del headers, first, last
214
215
216	def abort():
217	"Abort analysis"
218	global headers, first, last
219	del headers, first, last
220
221
222	def _headersort(item1, item2):
223	"Sort the headers"
224	if item1[0] == 'subject':
225	return -1
226	elif item2[0] == 'subject':
227	return 1
228	elif item1[0] == 'from':
229	return -1
230	elif item2[0] == 'from':
231	return 1
232	elif item1[0] == 'lines':
233	return -1
234	elif item2[0] == 'lines':
235	return 1
236	elif item1[0] == 'duration':
237	return -1
238	elif item2[0] == 'duration':
239	return 1
240	elif item1[0] == '=':
241	return -1
242	elif item2[0] == '=':
243	return 1
244	elif item1[0] < item2[0]:
245	return -1
246	return 1
247
248
249	def _valuesort(item1, item2):
250	"Sort a former dictionary"
251	if item1[1] < item2[1]:
252	return 1
253	elif item1[1] == item2[1]:
254	s1 = string.lower(item1[0])
255	s2 = string.lower(item2[0])
256	if s1 > s2:
257	return 1
258	elif s1 == s2:
259	return 0
260	return -1