1 |
# > !NewsStats.py.Analyse/py |
2 |
|
3 |
import os |
4 |
import re |
5 |
import rfc822 |
6 |
import string |
7 |
import types |
8 |
import wimp |
9 |
import mx.DateTime |
10 |
import MainW |
11 |
import Newsbase |
12 |
|
13 |
# Don't analyse these headers: process_headers() skips any header whose
# (lower-case) name appears here when building the per-header counts.
ignore_headers = ('date',
                  'message-id',
                  'nntp-posting-date',
                  'path',
                  'received',
                  'references',
                  'xref',
                  'x-trace')

# Some regular expressions

# Literal angle brackets
lt = re.compile('<')
gt = re.compile('>')

# A leading "Re: " (in any letter case) at the start of a subject
ref = re.compile('^Re: ', re.IGNORECASE)

# Dates whose year has only two digits.  Group 1 is the optional weekday
# plus day and month, group 3 is the two-digit year and the space after it.
# date1 matches years 80-99 and date2 years 00-79; process_headers()
# prefixes them with "19" and "20" respectively.
date1 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([89]\d )")
date2 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([0-7]\d )")

# A newline together with the whitespace that follows it, or a run of tabs.
# Written as a raw string so that "\s" reaches the regex engine untouched
# instead of relying on Python passing an unknown string escape through.
space = re.compile(r"(\n\s*)|(\t+)")

# Suffix appended to the name of every output file
extension = '/html'
31 |
|
32 |
|
33 |
def group(group, path, fast):
    """Analyse a group and write its statistics pages under *path*.

    group -- name of the newsgroup, passed to Newsbase and MainW
    path  -- output directory, passed on to end()
    fast  -- if true, use only the from/subject/date fields returned by
             Newsbase.ListArts() instead of fetching every article

    Returns the value of Newsbase.ListArts() if that is a string
    (presumably an error message -- confirm against Newsbase), 0 if the
    group has no articles, the details returned by Newsbase.GetArticle()
    if fetching an article fails, and None on success.
    """
    global articles
    MainW.set_status(group, '', '0')
    articles = Newsbase.ListArts(group)
    if isinstance(articles, types.StringType):
        return articles
    if not articles:
        return 0
    total = str(len(articles))
    MainW.set_status(group, '?', total)
    init()

    if fast:
        wimp.hourglass_on()
        try:
            for article in articles:
                process_headers({'from': article[2],
                                 'subject': article[3],
                                 'date': article[5]})
            end(group, path, 1)
        finally:
            # Never leave the hourglass showing if the analysis fails
            wimp.hourglass_off()
        return None

    art_no = 0
    for article in articles:
        # Poll the Wimp between articles while the slow path runs
        wimp.handle(wimp.poll(0))
        success, details = Newsbase.GetArticle(group, article[0])
        if not success:
            abort()
            return details
        file(details)
        MainW.set_status(group, str(art_no), total)
        art_no = art_no + 1
    end(group, path)
    # NOTE(review): kept from the original, which appears to make this call
    # on the slow path too even though the hourglass is only turned on in
    # the fast path -- confirm it is harmless or remove it.
    wimp.hourglass_off()
68 |
|
69 |
|
70 |
def init():
    """Get ready to analyse a group.

    Rebinds the three module-level accumulators to fresh, empty
    dictionaries:
      headers -- header name -> {value: number of occurrences}
      first   -- subject -> earliest date seen for it
      last    -- subject -> latest date seen for it
    """
    global headers, first, last
    headers, first, last = {}, {}, {}
76 |
|
77 |
|
78 |
def file(file):
    """Load an article file and feed its headers to process_headers().

    file -- path of the article to open for reading

    The file is closed even if reading or processing the headers raises.
    """
    f = open(file, 'r')
    try:
        # rfc822.Message reads the headers from the open file
        process_headers(rfc822.Message(f))
    finally:
        f.close()
90 |
|
91 |
|
92 |
def process_headers(m):
    """Process a dictionary of headers.

    m -- mapping of lower-case header name to header text: either a plain
         dictionary (built by group() in fast mode) or an rfc822.Message
         (built by file()).

    Every header not listed in ignore_headers is counted in the global
    'headers' table.  If the article has both a subject and a parsable
    date, the global 'first' and 'last' tables are updated with the
    earliest and latest date seen for that subject.
    """
    # Increment the header dictionaries
    for (header, content) in m.items():
        if '\n' in content or '\t' in content:
            content = re.sub(space, ' ', content)
        if header in ignore_headers:
            continue
        if header == 'subject':
            content = re.sub(ref, '', content)
        elif header == 'lines':
            # The header text comes straight from a news article, so it is
            # parsed with int() instead of being handed to eval().  A value
            # that is not a whole number is skipped rather than aborting
            # the whole analysis.
            try:
                content = int(content)
            except ValueError:
                continue
        header_d = headers.setdefault(header, {})
        header_d[content] = header_d.get(content, 0) + 1

    # Check dates for durations
    if not (m.has_key('subject') and m.has_key('date')):
        return
    # Normalise the subject exactly as in the counting loop above, so that
    # end() finds the same key in 'first'/'last' as in headers['subject'].
    subject = re.sub(ref, '', re.sub(space, ' ', m['subject']))
    # fix century-less dates
    date = re.sub(date2, r"\1 20\3", re.sub(date1, r"\1 19\3", m['date']))
    try:
        stamp = mx.DateTime.ARPA.ParseDateTime(date)
    except ValueError:
        # malformed dates raise ValueError
        return
    if not first.has_key(subject) or stamp < first[subject]:
        first[subject] = stamp
    if not last.has_key(subject) or stamp > last[subject]:
        last[subject] = stamp
133 |
|
134 |
|
135 |
def _write_page(page_path, group, name, counts):
    "Write the page that lists every value recorded for one header"
    entries = list(counts.items())
    if name == 'lines':
        # Numeric order, so that gaps in the line counts can be marked
        entries.sort()
    else:
        entries.sort(_valuesort)

    # Width of the value column: the widest value on the page, so that no
    # value is ever cut short when it is right-aligned.
    value_length = 0
    for item in entries:
        value_length = max(value_length, len(str(item[1])))

    lines_previous = None
    if name == 'lines' and entries:
        lines_previous = entries[0][0] - 1

    p = open(page_path, 'w')
    try:
        p.write((wimp.m['output.page.head'] + '\n') % (group, name, group, name))
        for key, value in entries:
            # Pad or cut the key to 63 characters, then escape it for HTML
            # ('&' first so the entities added below are not escaped again)
            text = (str(key) + ' ' * 64)[:63]
            text = text.replace('&', '&amp;')
            text = text.replace('<', '&lt;').replace('>', '&gt;')
            value_text = str(value).rjust(value_length)
            if name == 'lines':
                if lines_previous != key - 1:
                    p.write(wimp.m['output.page.gap'] + '\n')
                lines_previous = key
            if name == 'duration':
                duration = int(value.hours)
                if duration == 0:
                    p.write((wimp.m['output.page.line-0'] + '\n') %
                            (text, value_text))
                else:
                    p.write((wimp.m['output.page.line-d'] + '\n') %
                            (text, value_text, duration))
            else:
                p.write((wimp.m['output.page.line'] + '\n') %
                        (text, value_text, value))
        p.write(wimp.m['output.page.foot'] + '\n')
    finally:
        p.close()


def end(group, dir, fast = 0):
    """Analyse the collected data and write to file.

    group -- name of the newsgroup, inserted into the page templates
    dir   -- output directory; created if it does not exist yet.  The
             index is written to dir + '.index' + extension and one page
             per header to dir + '.<number>' + extension
    fast  -- if true, the separator entry is left out of the index

    The page text comes from the wimp.m message templates.  Each file is
    passed to the 'SetType ... FAF' command after it is closed (presumably
    the HTML filetype -- confirm).  On success the global 'headers',
    'first' and 'last' tables are deleted.
    """
    global headers, first, last

    # Calculate durations
    durations = {}
    for subject in headers.get('subject', {}).keys():
        try:
            durations[subject] = last[subject] - first[subject]
        except KeyError:
            # no parsable date was recorded for this subject
            pass

    # Sort the headers
    headers_list = list(headers.items())
    if not fast:
        headers_list.append(('=', {}))
    headers_list.append(('duration', durations))
    headers_list.sort(_headersort)

    # Start the index file
    if not os.path.isdir(dir):
        os.makedirs(dir)
    index_path = dir + '.index' + extension
    o = open(index_path, 'w')
    try:
        o.write((wimp.m['output.index.head'] + '\n') % (group, group))

        # Write a file for each header
        header_number = 1
        for name, counts in headers_list:
            if name == '=':
                o.write(wimp.m['output.index.separator'] + '\n')
                continue

            o.write((wimp.m['output.index.line'] + '\n') % (str(header_number), name))

            page_path = dir + '.' + str(header_number) + extension
            _write_page(page_path, group, name, counts)
            os.system('SetType ' + page_path + ' FAF')

            header_number = header_number + 1

        # Finish the file
        o.write(wimp.m['output.index.foot'] + '\n')
    finally:
        # Close the output file
        o.close()
    os.system('SetType ' + index_path + ' FAF')

    del headers, first, last
214 |
|
215 |
|
216 |
def abort():
    """Abort analysis.

    Deletes the module-level 'headers', 'first' and 'last' tables.  Safe
    to call when no analysis is in progress: names that do not exist are
    skipped instead of raising NameError.
    """
    module_state = globals()
    for name in ('headers', 'first', 'last'):
        try:
            del module_state[name]
        except KeyError:
            # already gone (or never created) -- nothing to discard
            pass
220 |
|
221 |
|
222 |
def _headersort(item1, item2): |
223 |
"Sort the headers" |
224 |
if item1[0] == 'subject': |
225 |
return -1 |
226 |
elif item2[0] == 'subject': |
227 |
return 1 |
228 |
elif item1[0] == 'from': |
229 |
return -1 |
230 |
elif item2[0] == 'from': |
231 |
return 1 |
232 |
elif item1[0] == 'lines': |
233 |
return -1 |
234 |
elif item2[0] == 'lines': |
235 |
return 1 |
236 |
elif item1[0] == 'duration': |
237 |
return -1 |
238 |
elif item2[0] == 'duration': |
239 |
return 1 |
240 |
elif item1[0] == '=': |
241 |
return -1 |
242 |
elif item2[0] == '=': |
243 |
return 1 |
244 |
elif item1[0] < item2[0]: |
245 |
return -1 |
246 |
return 1 |
247 |
|
248 |
|
249 |
def _valuesort(item1, item2): |
250 |
"Sort a former dictionary" |
251 |
if item1[1] < item2[1]: |
252 |
return 1 |
253 |
elif item1[1] == item2[1]: |
254 |
s1 = string.lower(item1[0]) |
255 |
s2 = string.lower(item2[0]) |
256 |
if s1 > s2: |
257 |
return 1 |
258 |
elif s1 == s2: |
259 |
return 0 |
260 |
return -1 |