1 |
james |
17 |
# > !NewsStats.py.Analyse/py |
2 |
|
|
|
3 |
|
|
import os |
4 |
|
|
import re |
5 |
|
|
import rfc822 |
6 |
|
|
import string |
7 |
|
|
import types |
8 |
|
|
import wimp |
9 |
|
|
import mx.DateTime |
10 |
|
|
import MainW |
11 |
|
|
import Newsbase |
12 |
|
|
|
13 |
|
|
# Don't analyse these headers (process_headers() skips any header named here)
ignore_headers = ('date',
                  'message-id',
                  'nntp-posting-date',
                  'path',
                  'received',
                  'references',
                  'xref',
                  'x-trace')

# Some regular expressions
lt = re.compile('<')
gt = re.compile('>')
# A leading "Re: " on a subject, in any mix of case
ref = re.compile('^Re: ', re.IGNORECASE)
# Dates carrying a two-digit year: 80-99 (date1) and 00-79 (date2).
# Group 1 is everything before the year, group 3 the year plus a space.
date1 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([89]\d )")
date2 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([0-7]\d )")
# A newline with any whitespace that follows it, or a run of tabs.
# Written as a raw string so that "\s" is handed to the regex engine
# rather than relying on an unrecognised string escape.
space = re.compile(r"(\n\s*)|(\t+)")

# Suffix appended to the name of every output file
extension = '/html'
31 |
|
|
|
32 |
|
|
|
33 |
|
|
def group(group, path, fast):
    """Analyse a group and write the results.

    group -- name of the group, passed to Newsbase and shown in the status
    path  -- output location, handed on unchanged to end()
    fast  -- if true, build the statistics from the from/subject/date
             fields of the article list alone (elements 2, 3 and 5 of
             each entry); otherwise fetch and read every article

    Returns whatever string Newsbase.ListArts() or the failure details
    Newsbase.GetArticle() hand back (presumably an error message -- TODO
    confirm against Newsbase), 0 if the group has no articles, and None
    once the analysis has been written.

    The article list is also left in the module-level name `articles`.
    """
    global articles
    MainW.set_status(group, '', '0')
    articles = Newsbase.ListArts(group)
    if isinstance(articles, types.StringType):
        # A string in place of a list is returned to the caller untouched
        return articles
    if articles == []:
        return 0
    MainW.set_status(group, '?', str(len(articles)))
    init()
    if fast:
        wimp.hourglass_on()
        try:
            # One dictionary is reused for every article; process_headers()
            # only reads it and keeps no reference to it
            h = {}
            for article in articles:
                h['from'] = article[2]
                h['subject'] = article[3]
                h['date'] = article[5]
                process_headers(h)
            end(group, path, 1)
        finally:
            # Never leave the hourglass showing if the analysis fails
            wimp.hourglass_off()
    else:
        art_no = 0
        for article in articles:
            # Keep the desktop responsive while the articles are read
            wimp.handle(wimp.poll(0))
            success, details = Newsbase.GetArticle(group, article[0])
            if not success:
                abort()
                return details
            file(details)
            MainW.set_status(group, str(art_no), str(len(articles)))
            art_no = art_no + 1
        end(group, path)
        # NOTE(review): this branch never switches the hourglass on; the
        # call is retained because the original made it on this path too
        wimp.hourglass_off()
68 |
|
|
|
69 |
|
|
|
70 |
|
|
def init():
    """Reset the module-level tables before a group is analysed.

    headers -- header name -> {value: number of occurrences}
    first   -- subject -> earliest time seen for that subject
    last    -- subject -> latest time seen for that subject
    """
    global headers, first, last
    # Three independent, empty dictionaries
    headers, first, last = {}, {}, {}
76 |
|
|
|
77 |
|
|
|
78 |
|
|
def file(file):
    """Load an article file and feed its headers to process_headers().

    file -- name of the article file to open for reading
    """
    # Open the article
    f = open(file, 'r')
    try:
        # Read in the headers
        process_headers(rfc822.Message(f))
    finally:
        # Close the file even when parsing or processing raises
        f.close()
90 |
|
|
|
91 |
|
|
|
92 |
|
|
def process_headers(m):
    """Fold one article's headers into the module-level tables.

    m -- mapping of header names to values offering items(): the
         rfc822.Message built by file() or the plain dictionary built by
         group() in fast mode.  Names are compared against lower-case
         strings.

    Every header not named in ignore_headers has its value counted in
    `headers`.  When both a subject and a date are present, `first` and
    `last` are widened to take in the article's time for that subject.
    """
    subject = None
    date = None

    # Increment the header dictionaries
    for (header, content) in m.items():
        if '\n' in content or '\t' in content:
            # Collapse the whitespace left behind by folded header lines
            content = space.sub(' ', content)
        if header == 'subject':
            content = ref.sub('', content)
            # Remember the cleaned-up subject so that the duration tables
            # below use exactly the same key as the subject counts
            subject = content
        elif header == 'date':
            date = content
        if header in ignore_headers:
            continue
        if header == 'lines':
            # The value comes straight from the article, so it is parsed
            # with int() rather than eval()'d; a malformed count is
            # skipped before any table entry is created for it
            try:
                content = int(content)
            except ValueError:
                continue
        header_d = headers.setdefault(header, {})
        header_d[content] = header_d.get(content, 0) + 1

    # Check dates for durations
    if subject is None or date is None:
        return
    # fix century-less dates
    date = date2.sub(r"\1 20\3", date1.sub(r"\1 19\3", date))
    try:
        when = mx.DateTime.ARPA.ParseDateTime(date)
    except ValueError:
        # malformed dates raise ValueError
        return
    if subject not in first or when < first[subject]:
        first[subject] = when
    if subject not in last or when > last[subject]:
        last[subject] = when
133 |
|
|
|
134 |
|
|
|
135 |
|
|
def end(group, dir, fast=0):
    """Analyse the collected data and write to file.

    group -- group name, substituted into the page and index headings
    dir   -- directory to create; output files are named by appending
             '.index' or '.<number>' plus `extension` to it
    fast  -- if true, no separator entry is added to the index

    Writes an index file plus one page per collected header (and one for
    the thread durations), then deletes the module-level tables.
    """
    global headers, first, last

    # Calculate durations
    durations = {}
    for subject in headers.get('subject', {}):
        # Only subjects with a parseable date appear in first and last
        if subject in first and subject in last:
            durations[subject] = last[subject] - first[subject]

    # Sort the headers
    headers_list = list(headers.items())
    if not fast:
        headers_list.append(('=', {}))
    headers_list.append(('duration', durations))
    headers_list.sort(_headersort)

    # Start the index file
    os.makedirs(dir)
    index_name = dir + '.index' + extension
    o = open(index_name, 'w')
    try:
        o.write((wimp.m['output.index.head'] + '\n') % (group, group))

        # Write a file for each header
        header_number = 1
        for (name, values) in headers_list:
            if name == '=':
                # The '=' entry only marks where the separator goes
                o.write(wimp.m['output.index.separator'] + '\n')
                continue

            o.write((wimp.m['output.index.line'] + '\n') % (str(header_number), name))
            _write_page(dir + '.' + str(header_number) + extension,
                        group, name, values)
            header_number = header_number + 1

        # Finish the file
        o.write(wimp.m['output.index.foot'] + '\n')
    finally:
        # Close the output file
        o.close()
    os.system('SetType ' + index_name + ' FAF')

    del headers, first, last


def _write_page(page, group, name, values):
    "Write the page listing every value collected for one header"
    items = list(values.items())
    if name == 'lines':
        # Line counts are listed in numerical order of the count itself
        items.sort()
    else:
        items.sort(_valuesort)
    # Pad every value to the widest one.  Taking the width from the first
    # item alone would cut digits off larger values further down the list.
    value_length = max([len(str(item[1])) for item in items] + [0])

    p = open(page, 'w')
    try:
        p.write((wimp.m['output.page.head'] + '\n') % (group, name, group, name))

        lines_previous = None
        if name == 'lines' and items:
            # Primed so that the first entry never reports a gap
            lines_previous = items[0][0] - 1
        for item in items:
            # Pad or cut to 63 characters before escaping, so the visible
            # width is unaffected by the longer entity text
            text = (str(item[0]) + ' ' * 64)[:63]
            if '<' in text:
                text = lt.sub('&lt;', text)
            if '>' in text:
                text = gt.sub('&gt;', text)
            value = (' ' * value_length + str(item[1]))[-value_length:]
            if name == 'lines':
                if lines_previous != item[0] - 1:
                    # One or more line counts are missing before this one
                    p.write(wimp.m['output.page.gap'] + '\n')
                lines_previous = item[0]
            if name == 'duration':
                duration = int(item[1].hours)
                if duration == 0:
                    p.write((wimp.m['output.page.line-0'] + '\n') %
                            (text, value))
                else:
                    p.write((wimp.m['output.page.line-d'] + '\n') %
                            (text, value, duration))
            else:
                p.write((wimp.m['output.page.line'] + '\n') %
                        (text, value, item[1]))
        p.write(wimp.m['output.page.foot'] + '\n')
    finally:
        p.close()
    # presumably FAF is the file type for HTML -- TODO confirm
    os.system('SetType ' + page + ' FAF')
214 |
|
|
|
215 |
|
|
|
216 |
|
|
def abort():
    """Abandon the analysis by deleting the module-level tables.

    Raises NameError if the tables do not exist.
    """
    global headers, first, last
    # The names are removed outright, not merely emptied
    del headers
    del first
    del last
220 |
|
|
|
221 |
|
|
|
222 |
|
|
def _headersort(item1, item2): |
223 |
|
|
"Sort the headers" |
224 |
|
|
if item1[0] == 'subject': |
225 |
|
|
return -1 |
226 |
|
|
elif item2[0] == 'subject': |
227 |
|
|
return 1 |
228 |
|
|
elif item1[0] == 'from': |
229 |
|
|
return -1 |
230 |
|
|
elif item2[0] == 'from': |
231 |
|
|
return 1 |
232 |
|
|
elif item1[0] == 'lines': |
233 |
|
|
return -1 |
234 |
|
|
elif item2[0] == 'lines': |
235 |
|
|
return 1 |
236 |
|
|
elif item1[0] == 'duration': |
237 |
|
|
return -1 |
238 |
|
|
elif item2[0] == 'duration': |
239 |
|
|
return 1 |
240 |
|
|
elif item1[0] == '=': |
241 |
|
|
return -1 |
242 |
|
|
elif item2[0] == '=': |
243 |
|
|
return 1 |
244 |
|
|
elif item1[0] < item2[0]: |
245 |
|
|
return -1 |
246 |
|
|
return 1 |
247 |
|
|
|
248 |
|
|
|
249 |
|
|
def _valuesort(item1, item2): |
250 |
|
|
"Sort a former dictionary" |
251 |
|
|
if item1[1] < item2[1]: |
252 |
|
|
return 1 |
253 |
|
|
elif item1[1] == item2[1]: |
254 |
|
|
s1 = string.lower(item1[0]) |
255 |
|
|
s2 = string.lower(item2[0]) |
256 |
|
|
if s1 > s2: |
257 |
|
|
return 1 |
258 |
|
|
elif s1 == s2: |
259 |
|
|
return 0 |
260 |
|
|
return -1 |