/[james]/archive/newsstats/!NewsStats/py/Analyse.py
ViewVC logotype

Contents of /archive/newsstats/!NewsStats/py/Analyse.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 17 - (show annotations) (download) (as text)
Tue Feb 11 09:59:30 2003 UTC (21 years, 11 months ago) by james
File MIME type: text/x-python
File size: 7502 byte(s)
Initial import.

1 # > !NewsStats.py.Analyse/py
2
3 import os
4 import re
5 import rfc822
6 import string
7 import types
8 import wimp
9 import mx.DateTime
10 import MainW
11 import Newsbase
12
# Headers whose values are not counted by process_headers().  The names are
# compared against the header names yielded by the message's items().
ignore_headers = (
    'date',
    'message-id',
    'nntp-posting-date',
    'path',
    'received',
    'references',
    'xref',
    'x-trace',
)

# Regular expressions shared by the analysis functions.
# lt / gt: the angle brackets that are escaped when writing HTML pages.
lt = re.compile('<')
gt = re.compile('>')
# ref: a leading "Re: " (any case), stripped so replies share the
# subject of the article they answer.
ref = re.compile('^Re: ', re.IGNORECASE)
# date1 / date2: dates whose year has only two digits.  Group 1 is the
# optional weekday plus day and month, group 3 the two-digit year and the
# space after it.  process_headers() rewrites years 80-99 as 19xx (date1)
# and years 00-79 as 20xx (date2).
date1 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([89]\d )")
date2 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([0-7]\d )")
# space: a line break with the whitespace that follows it, or a run of
# tabs; each match is collapsed to a single space.  Written as a raw string
# so that "\s" reaches the regex engine instead of relying on Python passing
# an unknown string escape through unchanged.
space = re.compile(r"(\n\s*)|(\t+)")

# Suffix appended to every output file name.
extension = '/html'
31
32
def group(group, path, fast):
    """Analyse a group and write its statistics pages.

    group -- name of the group, passed to Newsbase and shown in the status.
    path  -- output location, handed unchanged to end().
    fast  -- if true, use only the from/subject/date fields found in the
             article listing (entries 2, 3 and 5 of each listing item)
             instead of loading every article.

    Returns the value of Newsbase.ListArts() when that is a string
    (presumably an error message -- confirm against Newsbase), 0 when the
    group has no articles, the `details` value of a failed
    Newsbase.GetArticle() call, and None after a completed analysis.
    """
    global articles
    MainW.set_status(group, '', '0')
    articles = Newsbase.ListArts(group)
    if isinstance(articles, types.StringType):
        return articles
    if articles == []:
        return 0
    MainW.set_status(group, '?', str(len(articles)))
    init()
    if fast:
        wimp.hourglass_on()
        try:
            # One scratch dictionary is reused for every article;
            # process_headers() does not keep a reference to it.
            h = {}
            for article in articles:
                h['from'] = article[2]
                h['subject'] = article[3]
                h['date'] = article[5]
                process_headers(h)
            end(group, path, 1)
        finally:
            # Switch the hourglass off even when the analysis raises, so a
            # failure does not leave it showing.
            wimp.hourglass_off()
    else:
        art_no = 0
        for article in articles:
            # Poll between articles (presumably to keep the desktop
            # responsive during a long run).
            wimp.handle(wimp.poll(0))
            success, details = Newsbase.GetArticle(group, article[0])
            if not success:
                # On failure `details` is handed back to the caller.
                abort()
                return details
            # On success `details` is the name of the file to load.
            file(details)
            MainW.set_status(group, str(art_no), str(len(articles)))
            art_no = art_no + 1
        end(group, path)
        wimp.hourglass_off()
68
69
def init():
    "Reset the module-level accumulators before analysing a group"
    global headers, first, last
    # headers: per-header value counts; first / last: earliest and latest
    # article time seen for each subject (filled in by process_headers).
    headers, first, last = {}, {}, {}
76
77
def file(file):
    """Load an article file and analyse its headers.

    file -- name of the article file to open for reading.

    The headers are parsed with rfc822.Message and passed to
    process_headers().  Exceptions from opening, parsing or processing
    propagate to the caller.
    """
    f = open(file, 'r')
    try:
        process_headers(rfc822.Message(f))
    finally:
        # Close the article even if parsing or processing fails, so the
        # handle is not leaked.
        f.close()
90
91
def process_headers(m):
    """Fold one article's headers into the module-level statistics.

    m -- a mapping offering items(), has_key() and item access: either the
         rfc822.Message built by file() or the plain dictionary built by
         the fast path of group().

    Every header not listed in ignore_headers has its value counted in
    headers[name][value].  When both a subject and a date are present, the
    article time is used to widen the first/last range recorded for that
    subject.  Articles whose date cannot be parsed are left out of the
    range.
    """
    # Increment the header dictionaries
    for (header, content) in m.items():
        if header in ignore_headers:
            continue
        # Collapse line breaks and tab runs to single spaces.
        content = space.sub(' ', content)
        if header == 'subject':
            content = ref.sub('', content)
        elif header == 'lines':
            # NOTE(review): this used to be eval(content).  The value comes
            # straight from a news article, so eval() would execute
            # whatever a poster put in the header and raise arbitrary
            # exceptions on junk.  int() accepts only a plain integer;
            # anything else is skipped.
            try:
                content = int(content)
            except ValueError:
                continue
        # Create the per-header dictionary only once there is a value to
        # count, so a header never ends up with an empty dictionary.
        header_d = headers.setdefault(header, {})
        header_d[content] = header_d.get(content, 0) + 1

    # Check dates for durations
    if m.has_key('subject') and m.has_key('date'):
        # Normalise exactly as for the counts above, so the keys of
        # first/last match the keys of headers['subject'] even when the
        # subject was folded over several lines.
        subject = ref.sub('', space.sub(' ', m['subject']))
        # fix century-less dates: 80-99 -> 19xx, then 00-79 -> 20xx
        date = date2.sub(r"\1 20\3", date1.sub(r"\1 19\3", m['date']))
        try:
            when = mx.DateTime.ARPA.ParseDateTime(date)
        except ValueError:
            # malformed dates raise ValueError
            return
        if subject not in first or when < first[subject]:
            first[subject] = when
        if subject not in last or when > last[subject]:
            last[subject] = when
133
134
def end(group, dir, fast = 0):
    """Analyse the collected data and write it to file.

    group -- group name, substituted into the page templates.
    dir   -- directory to create for the output; file names are built as
             dir + '.' + leaf + extension.
    fast  -- if true, no separator entry is added to the index.

    Writes one index file plus one numbered page per header, then deletes
    the module-level headers/first/last accumulators.  The text templates
    are taken from wimp.m.  os.makedirs() raises if `dir` cannot be
    created; that and any I/O error propagate to the caller.
    """
    global headers, first, last

    # Calculate durations.  Subjects with no first/last entry are skipped;
    # a missing 'subject' table (no article had one) gives no durations.
    durations = {}
    for subject in headers.get('subject', {}).keys():
        try:
            durations[subject] = last[subject] - first[subject]
        except KeyError:
            pass

    # Sort the headers; '=' is a marker that becomes the index separator.
    headers_list = list(headers.items())
    if not fast:
        headers_list.append(('=', {}))
    headers_list.append(('duration', durations))
    headers_list.sort(_headersort)

    # Start the index file
    os.makedirs(dir)
    index_name = dir + '.index' + extension
    o = open(index_name, 'w')
    try:
        o.write((wimp.m['output.index.head'] + '\n') % (group, group))

        # Write a file for each header
        header_number = 1
        for (name, counts) in headers_list:
            if name == '=':
                o.write(wimp.m['output.index.separator'] + '\n')
                continue

            o.write((wimp.m['output.index.line'] + '\n')
                    % (str(header_number), name))

            page_name = dir + '.' + str(header_number) + extension
            _write_page(page_name, group, name, counts)
            # Set the file type of the finished page (FAF is presumably
            # the HTML file type -- confirm).
            os.system('SetType ' + page_name + ' FAF')

            header_number = header_number + 1

        # Finish the file
        o.write(wimp.m['output.index.foot'] + '\n')
    finally:
        # Close the index even if writing a page failed.
        o.close()
    os.system('SetType ' + index_name + ' FAF')

    del headers, first, last


def _write_page(page_name, group, name, counts):
    "Write the page listing every value recorded for one header"
    rows = list(counts.items())
    if name == 'lines':
        # Line counts are listed in numeric order of the count itself.
        rows.sort()
    else:
        rows.sort(_valuesort)

    # Width of the value column: the widest value on the page.  Taking it
    # from the first row alone would cut the leading characters off any
    # wider value further down, and would fail on an empty page.
    value_length = 0
    for row in rows:
        value_length = max(value_length, len(str(row[1])))

    p = open(page_name, 'w')
    try:
        p.write((wimp.m['output.page.head'] + '\n')
                % (group, name, group, name))

        lines_previous = None
        for (key, value) in rows:
            # Pad or cut the label to 63 characters before escaping, so
            # the visible width stays fixed.
            text = (str(key) + ' ' * 64)[:63]
            # '&' must be escaped first, or the entities produced for
            # '<' and '>' would be escaped a second time.
            text = text.replace('&', '&amp;')
            text = text.replace('<', '&lt;').replace('>', '&gt;')
            padded = (' ' * value_length + str(value))[-value_length:]

            if name == 'lines':
                # Mark a gap wherever the line counts are not consecutive.
                if lines_previous is not None and lines_previous != key - 1:
                    p.write(wimp.m['output.page.gap'] + '\n')
                lines_previous = key

            if name == 'duration':
                duration = int(value.hours)
                if duration == 0:
                    p.write((wimp.m['output.page.line-0'] + '\n')
                            % (text, padded))
                else:
                    p.write((wimp.m['output.page.line-d'] + '\n')
                            % (text, padded, duration))
            else:
                p.write((wimp.m['output.page.line'] + '\n')
                        % (text, padded, value))

        p.write(wimp.m['output.page.foot'] + '\n')
    finally:
        p.close()
214
215
def abort():
    "Abandon the current analysis and drop the collected data"
    global headers, first, last
    # Remove the accumulators created by init(), one at a time.
    del headers
    del first
    del last
220
221
222 def _headersort(item1, item2):
223 "Sort the headers"
224 if item1[0] == 'subject':
225 return -1
226 elif item2[0] == 'subject':
227 return 1
228 elif item1[0] == 'from':
229 return -1
230 elif item2[0] == 'from':
231 return 1
232 elif item1[0] == 'lines':
233 return -1
234 elif item2[0] == 'lines':
235 return 1
236 elif item1[0] == 'duration':
237 return -1
238 elif item2[0] == 'duration':
239 return 1
240 elif item1[0] == '=':
241 return -1
242 elif item2[0] == '=':
243 return 1
244 elif item1[0] < item2[0]:
245 return -1
246 return 1
247
248
249 def _valuesort(item1, item2):
250 "Sort a former dictionary"
251 if item1[1] < item2[1]:
252 return 1
253 elif item1[1] == item2[1]:
254 s1 = string.lower(item1[0])
255 s2 = string.lower(item2[0])
256 if s1 > s2:
257 return 1
258 elif s1 == s2:
259 return 0
260 return -1

  ViewVC Help
Powered by ViewVC 1.1.26