| 1 |
# > !NewsStats.py.Analyse/py |
| 2 |
|
| 3 |
import os |
| 4 |
import re |
| 5 |
import rfc822 |
| 6 |
import string |
| 7 |
import types |
| 8 |
import wimp |
| 9 |
import mx.DateTime |
| 10 |
import MainW |
| 11 |
import Newsbase |
| 12 |
|
| 13 |
# Don't analyse these headers
ignore_headers = (
    'date',
    'message-id',
    'nntp-posting-date',
    'path',
    'received',
    'references',
    'xref',
    'x-trace',
)

# Some regular expressions
# Literal angle brackets
lt = re.compile('<')
gt = re.compile('>')
# A leading "Re: " on a subject line, in any letter case
ref = re.compile('^Re: ', re.IGNORECASE)
# Two-digit years after "[Day,] DD Mon ": date1 matches 80-99, date2 matches
# 00-79; process_headers() prefixes them with "19" and "20" respectively
date1 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([89]\d )")
date2 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([0-7]\d )")
# A newline plus any following whitespace, or a run of tabs.  Written as a
# raw string so that "\s" is not an invalid string escape; the regex engine
# itself interprets \n and \t, so the pattern matches exactly as before.
space = re.compile(r"(\n\s*)|(\t+)")

# Appended to the name of every output file written by end()
extension = '/html'
| 31 |
|
| 32 |
|
| 33 |
def group(group, path, fast):
    """Analyse a group and write its statistics.

    group -- group name, passed to Newsbase, MainW and end()
    path  -- output location, passed through to end()
    fast  -- if true, only fields 2, 3 and 5 of each entry returned by
             Newsbase.ListArts() are used (as from, subject and date);
             otherwise every article is fetched with Newsbase.GetArticle()
             and its full header block is analysed

    Returns the ListArts() result unchanged if it is a string (presumably
    an error message -- confirm against Newsbase), 0 if the article list is
    empty, the details value of a failed GetArticle() call, and None once
    the output has been written.

    Side effects: stores the article list in the module global 'articles'
    and updates the status display through MainW.set_status().
    """
    global articles
    MainW.set_status(group, '', '0')
    articles = Newsbase.ListArts(group)
    if isinstance(articles, types.StringType):
        return articles
    if articles == []:
        return 0
    MainW.set_status(group, '?', str(len(articles)))
    init()
    if fast:
        wimp.hourglass_on()
        try:
            # One scratch dictionary is reused for every article
            h = {}
            for article in articles:
                h['from'] = article[2]
                h['subject'] = article[3]
                h['date'] = article[5]
                process_headers(h)
            end(group, path, 1)
        finally:
            # Never leave the hourglass showing if the analysis fails
            wimp.hourglass_off()
    else:
        art_no = 0
        for article in articles:
            # Poll the Wimp between articles while the slow path runs
            wimp.handle(wimp.poll(0))
            success, details = Newsbase.GetArticle(group, article[0])
            if not success:
                abort()
                return details
            file(details)
            MainW.set_status(group, str(art_no), str(len(articles)))
            art_no = art_no + 1
        end(group, path)
        wimp.hourglass_off()
| 68 |
|
| 69 |
|
| 70 |
def init():
    """Get ready to analyse a group.

    Rebinds the module globals 'headers', 'first' and 'last' to three
    separate empty dictionaries.
    """
    global headers, first, last
    headers, first, last = {}, {}, {}
| 76 |
|
| 77 |
|
| 78 |
def file(file):
    """Load an article file and analyse its headers.

    file -- name of the article file to open for reading

    The headers are parsed with rfc822.Message and handed to
    process_headers().  The file is closed even if parsing or processing
    raises an exception.
    """
    # Open the article
    f = open(file, 'r')
    try:
        # Read in the headers
        m = rfc822.Message(f)
        process_headers(m)
    finally:
        # Close file
        f.close()
| 90 |
|
| 91 |
|
| 92 |
def process_headers(m):
    """Process a dictionary of headers.

    m -- mapping of header name to header text: either an rfc822.Message
         or a plain dictionary.  Names are compared against lower-case
         strings such as 'subject' and 'lines'.

    For every header not listed in ignore_headers, the number of times
    each distinct value has been seen is counted in the module global
    'headers' (header name -> {value: count}).  Folded whitespace is
    collapsed to single spaces, a leading "Re: " is removed from subjects
    and the 'lines' value is converted to an integer.

    If both a subject and a date are present, the earliest and latest
    parsed dates for that subject are tracked in the module globals
    'first' and 'last'.
    """
    # Increment the header dictionaries
    for (header, content) in m.items():
        if header in ignore_headers:
            continue
        # Collapse continuation lines and tabs (a no-op for other text)
        content = re.sub(space, ' ', content)
        if header == 'subject':
            content = re.sub(ref, '', content)
        elif header == 'lines':
            # Header text comes from the network, so it must never be
            # passed to eval(); int() is all that is needed here.
            try:
                content = int(content)
            except ValueError:
                # malformed line count: leave it out of the statistics
                continue
        header_d = headers.setdefault(header, {})
        header_d[content] = header_d.get(content, 0) + 1

    # Check dates for durations
    if 'subject' in m and 'date' in m:
        # Normalise exactly as above so that the keys of first/last match
        # the keys of headers['subject'] when end() looks them up
        subject = re.sub(ref, '', re.sub(space, ' ', m['subject']))
        # fix century-less dates
        date = re.sub(date2, r"\1 20\3", re.sub(date1, r"\1 19\3", m['date']))
        try:
            time = mx.DateTime.ARPA.ParseDateTime(date)
        except ValueError:
            # malformed dates raise ValueError
            return
        if subject not in first or time < first[subject]:
            first[subject] = time
        if subject not in last or time > last[subject]:
            last[subject] = time
| 133 |
|
| 134 |
|
| 135 |
def end(group, dir, fast = 0):
    """Analyse the collected data and write to file.

    group -- group name, substituted into the page templates
    dir   -- output directory; it is created with os.makedirs() and the
             files are named dir + '.index' + extension and
             dir + '.<number>' + extension
    fast  -- if true, no separator entry is added to the index

    An index file is written with one entry per header, plus one numbered
    page per header listing every value seen and its count.  A 'duration'
    page lists, per subject, the time between the first and last dated
    article.  All text comes from the templates in wimp.m, which are used
    as % format strings.  Each finished file is passed to the 'SetType'
    command with type FAF (presumably the HTML filetype -- confirm).

    On success the module globals 'headers', 'first' and 'last' are
    deleted.  Both output files are closed even if writing fails.
    """
    global headers, first, last

    # Calculate durations
    durations = {}
    for subject in headers.get('subject', {}).keys():
        try:
            durations[subject] = last[subject] - first[subject]
        except KeyError:
            # no parsable date was recorded for this subject
            pass

    # Sort the headers
    headers_list = list(headers.items())
    if not fast:
        headers_list.append(('=', {}))
    headers_list.append(('duration', durations))
    headers_list.sort(_headersort)

    # Start the index file
    os.makedirs(dir)
    index_name = dir + '.index' + extension
    o = open(index_name, 'w')
    try:
        o.write((wimp.m['output.index.head'] + '\n') % (group, group))

        # Write a file for each header
        header_number = 1
        for name, counts in headers_list:
            if name == '=':
                # Pseudo-header: only marks where the separator goes
                o.write(wimp.m['output.index.separator'] + '\n')
                continue

            o.write((wimp.m['output.index.line'] + '\n') % (str(header_number), name))

            page_name = dir + '.' + str(header_number) + extension
            p = open(page_name, 'w')
            try:
                _write_page(p, group, name, counts)
            finally:
                p.close()
            os.system('SetType ' + page_name + ' FAF')

            header_number = header_number + 1

        # Finish the file
        o.write(wimp.m['output.index.foot'] + '\n')
    finally:
        # Close the output file
        o.close()
    os.system('SetType ' + index_name + ' FAF')

    del headers, first, last


def _write_page(p, group, name, counts):
    "Write the page for header 'name' with its {value: count} data to file p"
    p.write((wimp.m['output.page.head'] + '\n') % (group, name, group, name))

    items = list(counts.items())
    if name == 'lines':
        # Line counts are listed in numeric order of the line count itself
        items.sort()
    else:
        items.sort(_valuesort)

    # Width of the widest value, so that no entry is truncated when the
    # column is right-aligned (the first entry is not always the widest)
    value_length = 0
    for item in items:
        value_length = max(value_length, len(str(item[1])))

    lines_previous = None
    for key, value in items:
        # Pad or cut to 63 characters first, then escape, so the visible
        # width of the column is unchanged by the escaping
        text = (str(key) + ' ' * 64)[:63]
        text = text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
        padded = str(value).rjust(value_length)
        if name == 'lines':
            # Mark a gap wherever consecutive line counts are skipped
            if lines_previous is not None and lines_previous != key - 1:
                p.write(wimp.m['output.page.gap'] + '\n')
            lines_previous = key
        if name == 'duration':
            duration = int(value.hours)
            if duration == 0:
                p.write((wimp.m['output.page.line-0'] + '\n') %
                        (text, padded))
            else:
                p.write((wimp.m['output.page.line-d'] + '\n') %
                        (text, padded, duration))
        else:
            p.write((wimp.m['output.page.line'] + '\n') %
                    (text, padded, value))
    p.write(wimp.m['output.page.foot'] + '\n')
| 214 |
|
| 215 |
|
| 216 |
def abort():
    """Abort analysis.

    Discards the module globals 'headers', 'first' and 'last'.
    """
    global headers, first, last
    del headers
    del first
    del last
| 220 |
|
| 221 |
|
| 222 |
def _headersort(item1, item2): |
| 223 |
"Sort the headers" |
| 224 |
if item1[0] == 'subject': |
| 225 |
return -1 |
| 226 |
elif item2[0] == 'subject': |
| 227 |
return 1 |
| 228 |
elif item1[0] == 'from': |
| 229 |
return -1 |
| 230 |
elif item2[0] == 'from': |
| 231 |
return 1 |
| 232 |
elif item1[0] == 'lines': |
| 233 |
return -1 |
| 234 |
elif item2[0] == 'lines': |
| 235 |
return 1 |
| 236 |
elif item1[0] == 'duration': |
| 237 |
return -1 |
| 238 |
elif item2[0] == 'duration': |
| 239 |
return 1 |
| 240 |
elif item1[0] == '=': |
| 241 |
return -1 |
| 242 |
elif item2[0] == '=': |
| 243 |
return 1 |
| 244 |
elif item1[0] < item2[0]: |
| 245 |
return -1 |
| 246 |
return 1 |
| 247 |
|
| 248 |
|
| 249 |
def _valuesort(item1, item2): |
| 250 |
"Sort a former dictionary" |
| 251 |
if item1[1] < item2[1]: |
| 252 |
return 1 |
| 253 |
elif item1[1] == item2[1]: |
| 254 |
s1 = string.lower(item1[0]) |
| 255 |
s2 = string.lower(item2[0]) |
| 256 |
if s1 > s2: |
| 257 |
return 1 |
| 258 |
elif s1 == s2: |
| 259 |
return 0 |
| 260 |
return -1 |