| 1 |
james |
17 |
# > !NewsStats.py.Analyse/py |
| 2 |
|
|
|
| 3 |
|
|
import os |
| 4 |
|
|
import re |
| 5 |
|
|
import rfc822 |
| 6 |
|
|
import string |
| 7 |
|
|
import types |
| 8 |
|
|
import wimp |
| 9 |
|
|
import mx.DateTime |
| 10 |
|
|
import MainW |
| 11 |
|
|
import Newsbase |
| 12 |
|
|
|
| 13 |
|
|
# Headers that are skipped when counting header values
ignore_headers = (
    'date',
    'message-id',
    'nntp-posting-date',
    'path',
    'received',
    'references',
    'xref',
    'x-trace',
)

# Pre-compiled regular expressions used by the functions below
lt = re.compile('<')
gt = re.compile('>')
# A single leading "Re: " on a subject line, in any letter case
ref = re.compile('^Re: ', re.I)
# Dates whose year has only two digits: 80-99 (date1) and 00-79 (date2);
# process_headers() uses them to insert the missing "19" / "20"
date1 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([89]\d )")
date2 = re.compile(r"^((\w\w\w,)? *\d\d? \w\w\w) ([0-7]\d )")
# A newline plus any whitespace that follows it, or a run of tabs
space = re.compile("(\n\\s*)|(\t+)")

# Appended to the name of every output file
extension = '/html'
| 31 |
|
|
|
| 32 |
|
|
|
| 33 |
|
|
def group(group, path, fast):
    """Analyse a group and write its statistics pages.

    group -- name of the group; passed to Newsbase and shown in the status
    path  -- output location, handed on to end()
    fast  -- true: use only the from/subject/date fields of the article
             list; false: fetch every article and parse all its headers

    Returns the value of Newsbase.ListArts() unchanged when that is a string
    (presumably an error message -- confirm against Newsbase), 0 when the
    group has no articles, the details value of Newsbase.GetArticle() when
    fetching an article fails, and None after a completed analysis.
    """
    global articles

    MainW.set_status(group, '', '0')
    articles = Newsbase.ListArts(group)
    if isinstance(articles, types.StringType):
        return articles
    if articles == []:
        return 0
    total = str(len(articles))
    MainW.set_status(group, '?', total)
    init()

    if fast:
        wimp.hourglass_on()
        try:
            for article in articles:
                process_headers({'from': article[2],
                                 'subject': article[3],
                                 'date': article[5]})
            end(group, path, 1)
        finally:
            # Never leave the hourglass showing if the analysis fails
            wimp.hourglass_off()
    else:
        art_no = 0
        for article in articles:
            # Keep the desktop responsive while articles are fetched
            wimp.handle(wimp.poll(0))
            success, details = Newsbase.GetArticle(group, article[0])
            if not success:
                abort()
                return details
            file(details)
            # Count the article before reporting it, so the status runs
            # from 1 up to the total rather than from 0 to total - 1
            art_no = art_no + 1
            MainW.set_status(group, str(art_no), total)
        end(group, path)
        wimp.hourglass_off()
| 68 |
|
|
|
| 69 |
|
|
|
| 70 |
|
|
def init():
    "Reset the module-level tables before a group is analysed"
    global headers, first, last
    # headers: header name -> {value: count}
    # first / last: subject -> earliest / latest posting date seen
    headers, first, last = {}, {}, {}
| 76 |
|
|
|
| 77 |
|
|
|
| 78 |
|
|
def file(file):
    """Load one article file and add its headers to the statistics.

    file -- name of the article file to open for reading
    """
    f = open(file, 'r')
    try:
        # rfc822.Message reads the headers from the open file
        process_headers(rfc822.Message(f))
    finally:
        # Close the article even when parsing or processing raises
        f.close()
| 90 |
|
|
|
| 91 |
|
|
|
| 92 |
|
|
def process_headers(m):
    """Add the headers of one article to the running statistics.

    m -- mapping of header name to header text; it must provide items(),
         has_key() and item access (a plain dictionary or an
         rfc822.Message).  Names are compared with lower-case strings
         such as 'subject' and 'lines'.

    Updates the module-level tables headers, first and last.
    """
    # Count every value of every header that is not ignored
    for (header, content) in m.items():
        if header in ignore_headers:
            continue
        # Fold continuation lines and tab runs into single spaces
        content = re.sub(space, ' ', content)
        if header == 'subject':
            content = re.sub(ref, '', content)
        elif header == 'lines':
            # NOTE(review): this used to be eval(content).  The value comes
            # straight from the article, so eval() would run arbitrary
            # code; int() also reads "010" as 10 rather than as octal.
            try:
                content = int(content)
            except ValueError:
                # Not a number: leave it out instead of aborting the run
                continue
        if headers.has_key(header):
            counts = headers[header]
        else:
            counts = headers[header] = {}
        counts[content] = counts.get(content, 0) + 1

    # Track the earliest and latest posting date of each subject
    if m.has_key('subject') and m.has_key('date'):
        # Normalise exactly as above so that these keys match the keys
        # of headers['subject'] when end() looks the durations up
        subject = re.sub(ref, '', re.sub(space, ' ', m['subject']))
        # Give two-digit years a century: 80-99 -> 19xx, 00-79 -> 20xx
        date = re.sub(date2, r"\1 20\3", re.sub(date1, r"\1 19\3", m['date']))
        try:
            when = mx.DateTime.ARPA.ParseDateTime(date)
        except ValueError:
            # Malformed dates raise ValueError; skip them
            return
        if not first.has_key(subject) or when < first[subject]:
            first[subject] = when
        if not last.has_key(subject) or when > last[subject]:
            last[subject] = when
| 133 |
|
|
|
| 134 |
|
|
|
| 135 |
|
|
def end(group, dir, fast = 0):
    """Analyse the collected data and write it out as pages.

    group -- group name, substituted into the page templates
    dir   -- directory to create; the files are named dir + '.index' and
             dir + '.<number>', each followed by the module's extension
    fast  -- when true, no separator entry is added to the index

    Creates dir with os.makedirs(), which raises os.error if dir already
    exists.  Each written file is passed to the 'SetType ... FAF' command
    (presumably the RISC OS filetype for HTML -- confirm).  The
    module-level tables headers, first and last are deleted afterwards.
    """
    global headers, first, last

    # Duration of a subject = latest posting date - earliest posting date
    durations = {}
    for subject in headers.get('subject', {}).keys():
        if first.has_key(subject) and last.has_key(subject):
            durations[subject] = last[subject] - first[subject]

    # Put the pages in index order; '=' stands for the separator line
    headers_list = headers.items()
    if not fast:
        headers_list.append(('=', {}))
    headers_list.append(('duration', durations))
    headers_list.sort(_headersort)

    os.makedirs(dir)
    index_name = dir + '.index' + extension
    o = open(index_name, 'w')
    try:
        o.write((wimp.m['output.index.head'] + '\n') % (group, group))

        # One index line and one page per header
        header_number = 1
        for (name, counts) in headers_list:
            if name == '=':
                o.write(wimp.m['output.index.separator'] + '\n')
                continue
            o.write((wimp.m['output.index.line'] + '\n')
                    % (str(header_number), name))
            _write_page(dir + '.' + str(header_number) + extension,
                        group, name, counts)
            header_number = header_number + 1

        o.write(wimp.m['output.index.foot'] + '\n')
    finally:
        # Close the index even if writing a page fails
        o.close()
    os.system('SetType ' + index_name + ' FAF')

    del headers, first, last


def _write_page(filename, group, name, counts):
    "Write the page that lists every value of one header; helper of end()"
    rows = counts.items()
    if name == 'lines':
        # Line counts are listed in numerical order
        rows.sort()
    else:
        rows.sort(_valuesort)

    # Pad to the widest value in the column.  Taking the width of the
    # first row only would truncate wider counts on the 'lines' page and
    # fail outright on an empty table.
    value_length = 0
    for (value, count) in rows:
        value_length = max(value_length, len(str(count)))
    padding = ' ' * value_length

    lines_previous = None
    p = open(filename, 'w')
    try:
        p.write((wimp.m['output.page.head'] + '\n')
                % (group, name, group, name))
        for (value, count) in rows:
            # Fixed-width column: pad or cut to 63 characters, then escape
            text = _escape_html((str(value) + ' ' * 64)[:63])
            padded = (padding + str(count))[-value_length:]
            if name == 'lines':
                # Mark a jump between non-consecutive line counts
                if lines_previous is not None and lines_previous != value - 1:
                    p.write(wimp.m['output.page.gap'] + '\n')
                lines_previous = value
            if name == 'duration':
                hours = int(count.hours)
                if hours == 0:
                    p.write((wimp.m['output.page.line-0'] + '\n')
                            % (text, padded))
                else:
                    p.write((wimp.m['output.page.line-d'] + '\n')
                            % (text, padded, hours))
            else:
                p.write((wimp.m['output.page.line'] + '\n')
                        % (text, padded, count))
        p.write(wimp.m['output.page.foot'] + '\n')
    finally:
        p.close()
    os.system('SetType ' + filename + ' FAF')


def _escape_html(text):
    "Replace the characters that HTML would read as markup"
    # '&' goes first so the entities added below are not escaped again
    text = re.sub('&', '&amp;', text)
    text = re.sub(lt, '&lt;', text)
    return re.sub(gt, '&gt;', text)
| 214 |
|
|
|
| 215 |
|
|
|
| 216 |
|
|
def abort():
    "Throw away the data of an unfinished analysis"
    global headers, first, last
    # Remove the tables created by init(), in the same order
    del headers
    del first
    del last
| 220 |
|
|
|
| 221 |
|
|
|
| 222 |
|
|
def _headersort(item1, item2): |
| 223 |
|
|
"Sort the headers" |
| 224 |
|
|
if item1[0] == 'subject': |
| 225 |
|
|
return -1 |
| 226 |
|
|
elif item2[0] == 'subject': |
| 227 |
|
|
return 1 |
| 228 |
|
|
elif item1[0] == 'from': |
| 229 |
|
|
return -1 |
| 230 |
|
|
elif item2[0] == 'from': |
| 231 |
|
|
return 1 |
| 232 |
|
|
elif item1[0] == 'lines': |
| 233 |
|
|
return -1 |
| 234 |
|
|
elif item2[0] == 'lines': |
| 235 |
|
|
return 1 |
| 236 |
|
|
elif item1[0] == 'duration': |
| 237 |
|
|
return -1 |
| 238 |
|
|
elif item2[0] == 'duration': |
| 239 |
|
|
return 1 |
| 240 |
|
|
elif item1[0] == '=': |
| 241 |
|
|
return -1 |
| 242 |
|
|
elif item2[0] == '=': |
| 243 |
|
|
return 1 |
| 244 |
|
|
elif item1[0] < item2[0]: |
| 245 |
|
|
return -1 |
| 246 |
|
|
return 1 |
| 247 |
|
|
|
| 248 |
|
|
|
| 249 |
|
|
def _valuesort(item1, item2): |
| 250 |
|
|
"Sort a former dictionary" |
| 251 |
|
|
if item1[1] < item2[1]: |
| 252 |
|
|
return 1 |
| 253 |
|
|
elif item1[1] == item2[1]: |
| 254 |
|
|
s1 = string.lower(item1[0]) |
| 255 |
|
|
s2 = string.lower(item2[0]) |
| 256 |
|
|
if s1 > s2: |
| 257 |
|
|
return 1 |
| 258 |
|
|
elif s1 == s2: |
| 259 |
|
|
return 0 |
| 260 |
|
|
return -1 |