/[james]/archive/newsstats/!NewsStats/py/Analyse.py
ViewVC logotype

Annotation of /archive/newsstats/!NewsStats/py/Analyse.py

Parent Directory Parent Directory | Revision Log Revision Log


Revision 17 - (hide annotations) (download) (as text)
Tue Feb 11 09:59:30 2003 UTC (21 years, 10 months ago) by james
File MIME type: text/x-python
File size: 7502 byte(s)
Initial import.

1 james 17 # > !NewsStats.py.Analyse/py
2    
3     import os
4     import re
5     import rfc822
6     import string
7     import types
8     import wimp
9     import mx.DateTime
10     import MainW
11     import Newsbase
12    
# Headers that are left out of the per-header statistics
ignore_headers = (
    'date', 'message-id', 'nntp-posting-date', 'path',
    'received', 'references', 'xref', 'x-trace')

# Pre-compiled patterns shared by the functions below.
# '<' and '>' are looked up when text is written into the HTML pages.
lt = re.compile('<')
gt = re.compile('>')
# A leading "Re: " on a subject, in any letter case
ref = re.compile('^Re: ', re.IGNORECASE)
# Dates with a two-digit year: date1 matches years 80-99 and
# date2 matches years 00-79
date1 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([89]\d )")
date2 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([0-7]\d )")
# A newline together with any whitespace after it, or a run of tabs
space = re.compile("(\n\\s*)|(\t+)")

# Suffix added to the name of every output file
extension = '/html'
31    
32    
def group(group, path, fast):
    """Analyse a group.

    group -- name of the newsgroup, passed to Newsbase and MainW
    path  -- output location, handed on unchanged to end()
    fast  -- if true, use only the from/subject/date fields of the
             listing returned by Newsbase.ListArts instead of fetching
             and parsing every article

    Returns whatever Newsbase.ListArts returned when that is a string
    (presumably an error message - confirm against Newsbase), 0 when the
    group holds no articles, the 'details' value from
    Newsbase.GetArticle when fetching an article fails, and None once
    the analysis has been written.
    """
    global articles
    MainW.set_status(group, '', '0')
    articles = Newsbase.ListArts(group)
    if isinstance(articles, types.StringType):
        return articles
    if articles == []:
        return 0
    total = str(len(articles))
    MainW.set_status(group, '?', total)
    init()
    if fast:
        wimp.hourglass_on()
        try:
            # The same dictionary is refilled for every article
            h = {}
            for article in articles:
                h['from'] = article[2]
                h['subject'] = article[3]
                h['date'] = article[5]
                process_headers(h)
            end(group, path, 1)
        finally:
            # Always balance hourglass_on(), even when processing fails
            wimp.hourglass_off()
    else:
        art_no = 0
        for article in articles:
            # Poll between articles
            wimp.handle(wimp.poll(0))
            success, details = Newsbase.GetArticle(group, article[0])
            if not success:
                abort()
                return details
            file(details)
            MainW.set_status(group, str(art_no), total)
            art_no = art_no + 1
        end(group, path)
        # NOTE(review): kept from the original although this branch never
        # calls hourglass_on() - confirm whether it is needed.
        wimp.hourglass_off()
68    
69    
def init():
    "Reset the module-level tables before a group is analysed"
    global headers, first, last
    # headers: per-header tallies; first/last: date tables keyed by subject
    headers, first, last = {}, {}, {}
76    
77    
def file(file):
    """Load an article file and process its headers.

    file -- name of the article file to open

    The headers are parsed with rfc822.Message and passed to
    process_headers().  The file is closed even if parsing or
    processing raises an exception.
    """
    f = open(file, 'r')
    try:
        process_headers(rfc822.Message(f))
    finally:
        f.close()
90    
91    
def _normalise_subject(text):
    "Collapse folded whitespace in a subject and strip a leading 'Re: '"
    if '\n' in text or '\t' in text:
        text = re.sub(space, ' ', text)
    return re.sub(ref, '', text)


def process_headers(m):
    """Process a dictionary of headers.

    m -- mapping of lower-case header names to values; both an
         rfc822.Message and a plain dictionary are passed in by this
         module

    Every header not listed in ignore_headers is tallied in the global
    'headers' table (header name -> {value -> count}).  If the message
    has both a subject and a parseable date, the global 'first' and
    'last' tables are updated with the earliest and latest date seen
    for that subject.
    """
    # Increment the header dictionaries
    for (header, content) in m.items():
        if header in ignore_headers:
            continue
        if header == 'subject':
            content = _normalise_subject(content)
        else:
            if '\n' in content or '\t' in content:
                content = re.sub(space, ' ', content)
            if header == 'lines':
                # The value comes straight from the article, so it is
                # parsed as a number rather than passed to eval().
                # Values that are not plain integers are not counted.
                try:
                    content = int(content)
                except ValueError:
                    continue
        header_d = headers.setdefault(header, {})
        header_d[content] = header_d.get(content, 0) + 1

    # Check dates for durations
    if m.has_key('subject') and m.has_key('date'):
        # Use the same normalisation as the tally above so that end() can
        # look the subject up in 'first' and 'last'
        subject = _normalise_subject(m['subject'])
        # fix century-less dates
        date = re.sub(date2, r"\1 20\3", re.sub(date1, r"\1 19\3", m['date']))
        try:
            time = mx.DateTime.ARPA.ParseDateTime(date)
        except ValueError:
            # malformed dates raise ValueError
            return
        if not first.has_key(subject) or time < first[subject]:
            first[subject] = time
        if not last.has_key(subject) or time > last[subject]:
            last[subject] = time
133    
134    
def _escape_html(text):
    "Escape the characters that are special in HTML text"
    # '&' goes first so the entities produced below are not escaped again
    text = text.replace('&', '&amp;')
    text = text.replace('<', '&lt;')
    return text.replace('>', '&gt;')


def _write_page(filename, group, name, counts):
    "Write the page for one header; counts maps each value to its figure"
    entries = counts.items()
    if name == 'lines':
        # Line counts are listed in ascending order of the line count
        entries.sort()
    else:
        entries.sort(_valuesort)

    # Width of the value column: the widest figure on the page, so that
    # no figure is cut short
    value_length = 0
    for item in entries:
        value_length = max(value_length, len(str(item[1])))

    p = open(filename, 'w')
    try:
        p.write((wimp.m['output.page.head'] + '\n') % (group, name, group, name))
        lines_previous = None
        for key, value in entries:
            # Pad or cut the key to 63 characters before escaping it
            text = _escape_html((str(key) + ' ' * 64)[:63])
            figure = str(value).rjust(value_length)
            if name == 'lines':
                # Mark a gap wherever consecutive line counts are missing
                if lines_previous is not None and lines_previous != key - 1:
                    p.write(wimp.m['output.page.gap'] + '\n')
                lines_previous = key
            if name == 'duration':
                duration = int(value.hours)
                if duration == 0:
                    p.write((wimp.m['output.page.line-0'] + '\n') %
                            (text, figure))
                else:
                    p.write((wimp.m['output.page.line-d'] + '\n') %
                            (text, figure, duration))
            else:
                p.write((wimp.m['output.page.line'] + '\n') %
                        (text, figure, value))
        p.write(wimp.m['output.page.foot'] + '\n')
    finally:
        p.close()


def end(group, dir, fast = 0):
    """Analyse the collected data and write to file.

    group -- group name, inserted into the page templates
    dir   -- directory to create; all output files are written under it
    fast  -- if true, no separator entry is added to the index

    Creates 'dir', writes an index file plus one numbered page per
    header (numbering starts at 1), runs 'SetType <file> FAF' on each
    file written, and finally deletes the global headers, first and
    last tables.  Templates are taken from wimp.m.
    """
    global headers, first, last

    # Calculate durations for every subject that has both dates recorded
    durations = {}
    for subject in headers.get('subject', {}).keys():
        if first.has_key(subject) and last.has_key(subject):
            durations[subject] = last[subject] - first[subject]

    # Sort the headers
    headers_list = headers.items()
    if not fast:
        headers_list.append(('=', {}))
    headers_list.append(('duration', durations))
    headers_list.sort(_headersort)

    # Start the index file
    # NOTE(review): os.makedirs raises OSError if 'dir' already exists -
    # confirm that callers always pass a fresh directory.
    os.makedirs(dir)
    index_name = dir + '.index' + extension
    o = open(index_name, 'w')
    try:
        o.write((wimp.m['output.index.head'] + '\n') % (group, group))

        # Write a file for each header
        header_number = 1
        for name, counts in headers_list:
            if name == '=':
                o.write(wimp.m['output.index.separator'] + '\n')
            else:
                o.write((wimp.m['output.index.line'] + '\n') %
                        (str(header_number), name))
                page_name = dir + '.' + str(header_number) + extension
                _write_page(page_name, group, name, counts)
                # Presumably sets the file type of the page - confirm
                os.system('SetType ' + page_name + ' FAF')
                header_number = header_number + 1

        # Finish the file
        o.write(wimp.m['output.index.foot'] + '\n')
    finally:
        # Close the output file
        o.close()
    os.system('SetType ' + index_name + ' FAF')

    del headers, first, last
214    
215    
def abort():
    "Drop the tables of a partly analysed group"
    global headers, first, last
    del headers
    del first
    del last
220    
221    
222     def _headersort(item1, item2):
223     "Sort the headers"
224     if item1[0] == 'subject':
225     return -1
226     elif item2[0] == 'subject':
227     return 1
228     elif item1[0] == 'from':
229     return -1
230     elif item2[0] == 'from':
231     return 1
232     elif item1[0] == 'lines':
233     return -1
234     elif item2[0] == 'lines':
235     return 1
236     elif item1[0] == 'duration':
237     return -1
238     elif item2[0] == 'duration':
239     return 1
240     elif item1[0] == '=':
241     return -1
242     elif item2[0] == '=':
243     return 1
244     elif item1[0] < item2[0]:
245     return -1
246     return 1
247    
248    
249     def _valuesort(item1, item2):
250     "Sort a former dictionary"
251     if item1[1] < item2[1]:
252     return 1
253     elif item1[1] == item2[1]:
254     s1 = string.lower(item1[0])
255     s2 = string.lower(item2[0])
256     if s1 > s2:
257     return 1
258     elif s1 == s2:
259     return 0
260     return -1

  ViewVC Help
Powered by ViewVC 1.1.26