/* * This file is part of Sargasso, http://zamez.org/sargasso * Licensed under the GNU General Public License, * http://www.opensource.org/licenses/gpl-license * Copyright 2006 James Bursa */ #include #include #include #include #include #include #include #include "feed.h" unsigned int feed_count = 0; struct feed *feeds = 0; bool feed_work_needed = false; const char *feed_error = 0; static const char *feed_status_name[] = { "NEW", "FETCHING", "OK", "ERROR" }; static CURLM *curl_multi_handle; static unsigned int fetching = 0; #define MAX_FETCHES 3 static void feed_set_status(struct feed *feed, feed_status status); static void feed_work_feed(struct feed *feed); static void feed_create_fetch(struct feed *feed); static void feed_start_fetch(struct feed *feed); static size_t feed_header_callback(void *ptr, size_t size, size_t nmemb, void *stream); static size_t feed_write_callback(void *ptr, size_t size, size_t nmemb, void *stream); static void feed_fetched(struct feed *feed, CURLcode result); static void feed_parse(struct feed *feed); static void feed_parse_item(struct feed *feed, xmlNode *node); static void feed_free_item(struct feed *feed, unsigned int i); static void feed_clean_text(xmlChar *text); /** * Initialise the feed module. */ bool feed_init(void) { CURLcode code; code = curl_global_init(CURL_GLOBAL_ALL); if (code != CURLE_OK) { feed_error = curl_easy_strerror(code); return false; } curl_multi_handle = curl_multi_init(); if (!curl_multi_handle) { feed_error = "Failed to initialise curl"; return false; } xmlInitParser(); return true; } /** * Quit the feed module. */ void feed_quit(void) { while (feed_count) feed_remove(0); free(feeds); feeds = 0; xmlCleanupParser(); curl_multi_cleanup(curl_multi_handle); } /** * Add a new feed. 
*/ bool feed_add(const char *url) { struct feed *feeds1; struct feed *feed; char *url1; unsigned int i; assert(url); feeds1 = realloc(feeds, sizeof *feed * (feed_count + 1)); if (!feeds1) { feed_error = "Out of memory"; return false; } feeds = feeds1; url1 = strdup(url); if (!url1) { feed_error = "Out of memory"; free(url1); return false; } feed = &feeds[feed_count]; feed->url = url1; feed->status = FEED_NEW; feed->error = 0; feed->status_line = 0; feed->etag = 0; feed->redirect = 0; feed->data = 0; feed->data_size = 0; feed->title = 0; feed->description = 0; feed->link = 0; feed->copyright = 0; feed->pub_date = 0; feed->category = 0; for (i = 0; i != FEED_MAX_ITEMS; i++) { feed->item[i].title = 0; feed->item[i].description = 0; feed->item[i].link = 0; feed->item[i].author = 0; feed->item[i].pub_date = 0; feed->item[i].category = 0; feed->item[i].guid = 0; feed->item[i].new_item = false; } feed->item_count = 0; feed_count++; feed_work_needed = true; printf("added feed %s\n", url); return true; } /** * Remove a feed. */ bool feed_remove(unsigned int i) { unsigned int j; assert(i < feed_count); if (feeds[i].status == FEED_FETCHING) { curl_multi_remove_handle(curl_multi_handle, feeds[i].curl); curl_easy_cleanup(feeds[i].curl); curl_slist_free_all(feeds[i].headers); feeds[i].headers = 0; } for (j = 0; j != feeds[i].item_count; j++) feed_free_item(&feeds[i], j); free(feeds[i].url); free(feeds[i].status_line); free(feeds[i].etag); free(feeds[i].redirect); free(feeds[i].data); if (feeds[i].title) xmlFree(feeds[i].title); if (feeds[i].description) xmlFree(feeds[i].description); if (feeds[i].link) xmlFree(feeds[i].link); if (feeds[i].copyright) xmlFree(feeds[i].copyright); if (feeds[i].pub_date) xmlFree(feeds[i].pub_date); if (feeds[i].category) xmlFree(feeds[i].category); if (i != feed_count - 1) memmove(&feeds[i], &feeds[i + 1], (sizeof feeds[0]) * (feed_count - i - 1)); feed_count--; return true; } /** * Set the status of a feed. 
*/ void feed_set_status(struct feed *feed, feed_status status) { printf("status %s %s => %s\n", feed->url, feed_status_name[feed->status], feed_status_name[status]); feed->status = status; feed->updated = true; } /** * Do some work on the feeds. */ bool feed_work(void) { unsigned int i; int running; int queue; CURLMsg *msg; struct feed *feed; for (i = 0; i != feed_count; i++) { feeds[i].updated = false; feed_work_feed(&feeds[i]); } feed_work_needed = false; while (curl_multi_perform(curl_multi_handle, &running) == CURLM_CALL_MULTI_PERFORM) continue; if ((msg = curl_multi_info_read(curl_multi_handle, &queue))) { if (msg->msg == CURLMSG_DONE) { curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, &feed); feed_fetched(feed, msg->data.result); } } for (i = 0; i != feed_count; i++) if (feeds[i].status == FEED_NEW || feeds[i].status == FEED_FETCHING || feeds[i].status == FEED_UPDATE) feed_work_needed = true; for (i = 0; i != feed_count; i++) if (feeds[i].updated) return true; return false; } /** * Do some work on a feed. */ void feed_work_feed(struct feed *feed) { assert(feed); if ((feed->status == FEED_NEW || feed->status == FEED_UPDATE) && fetching < MAX_FETCHES) { feed_create_fetch(feed); if (feed->status != FEED_ERROR) feed_start_fetch(feed); } } /** * Create a fetch for a feed. 
*/ void feed_create_fetch(struct feed *feed) { CURL *curl; struct curl_slist *headers = 0; struct curl_slist *headers2 = 0; curl = curl_easy_init(); if (!curl) { feed_set_status(feed, FEED_ERROR); feed->error = "Failed to create curl session"; return; } headers2 = curl_slist_append(headers, "Accept: " "application/rss+xml, application/xml, text/xml"); if (!headers2) { curl_easy_cleanup(feed); curl_slist_free_all(headers); feed_set_status(feed, FEED_ERROR); feed->error = "Out of memory"; return; } headers = headers2; if (feed->etag) { size_t n = 20 + strlen(feed->etag); char if_none_match[n]; snprintf(if_none_match, n, "If-None-Match: %s", feed->etag); headers2 = curl_slist_append(headers, if_none_match); if (!headers2) { curl_easy_cleanup(curl); curl_slist_free_all(headers); feed_set_status(feed, FEED_ERROR); feed->error = "Out of memory"; return; } headers = headers2; } curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L); curl_easy_setopt(curl, CURLOPT_URL, feed->url); curl_easy_setopt(curl, CURLOPT_PRIVATE, feed); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, feed_write_callback); curl_easy_setopt(curl, CURLOPT_WRITEDATA, feed); curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, feed_header_callback); curl_easy_setopt(curl, CURLOPT_HEADERDATA, feed); curl_easy_setopt(curl, CURLOPT_USERAGENT, "Sargasso (http://zamez.org/sargasso)"); curl_easy_setopt(curl, CURLOPT_ENCODING, "gzip"); curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers); curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 1L); curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 60L); curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L); curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30L); feed->curl = curl; feed->headers = headers; feed->redirect_count = 0; } /** * Start fetching a feed. 
*/ void feed_start_fetch(struct feed *feed) { CURLMcode mcode; mcode = curl_multi_add_handle(curl_multi_handle, feed->curl); if (mcode != CURLM_OK) { feed_set_status(feed, FEED_ERROR); feed->error = curl_multi_strerror(mcode); curl_easy_cleanup(feed->curl); curl_slist_free_all(feed->headers); feed->curl = 0; feed->headers = 0; return; } free(feed->status_line); feed->status_line = 0; feed->data_size = 0; feed->error = 0; feed_set_status(feed, FEED_FETCHING); printf("fetching feed %s\n", feed->url); fetching++; feed_work_needed = true; } /** * Callback for receiving headers for a feed. */ size_t feed_header_callback(void *ptr, size_t size, size_t nmemb, void *stream) { struct feed *feed = (struct feed *) stream; size_t n = size * nmemb; char header[n + 1]; char *value; strncpy(header, ptr, n); header[n] = 0; while (0 < n && header[n - 1] <= 32) header[--n] = 0; if (!feed->status_line) { feed->status_line = strdup(header); return size * nmemb; } value = strchr(header, ':'); if (!value) return size * nmemb; *value = 0; value++; while (isspace(*value)) value++; if (strcasecmp(header, "ETag") == 0 && value[0] == '"') { free(feed->etag); feed->etag = strdup(value); } else if (strcasecmp(header, "Location") == 0) { free(feed->redirect); feed->redirect = strdup(value); } return size * nmemb; } /** * Callback for receiving data for a feed. */ size_t feed_write_callback(void *ptr, size_t size, size_t nmemb, void *stream) { struct feed *feed = (struct feed *) stream; char *data; printf("received %u for %s\n", size * nmemb, feed->url); data = realloc(feed->data, feed->data_size + size * nmemb); if (!data) { feed_set_status(feed, FEED_ERROR); feed->error = "Out of memory"; return 0; } memcpy(data + feed->data_size, ptr, size * nmemb); feed->data = data; feed->data_size += size * nmemb; return size * nmemb; } /** * Process a complete feed fetch. 
 *
 * Called from feed_work() when curl reports the transfer finished.
 * Parses the body on 200, follows redirects (re-starting the fetch),
 * and releases the curl handle and headers in all non-redirect cases.
 */
void feed_fetched(struct feed *feed, CURLcode result)
{
	long http_code;

	printf("finished %s with result %i %s\n", feed->url, result,
			curl_easy_strerror(result));

	/* the transfer slot is free again (feed_start_fetch() re-takes it
	 * on the redirect path below) */
	fetching--;

	if (result == CURLE_OK) {
		curl_easy_getinfo(feed->curl, CURLINFO_RESPONSE_CODE,
				&http_code);
		printf("HTTP code %li\n", http_code);
		/* code 0 means a non-HTTP protocol: treat as success */
		if (http_code == 0 || http_code == 200 /* OK */) {
			feed_parse(feed);
		} else if (http_code == 300 /* Multiple Choices */ ||
				http_code == 301 /* Moved Permanently */ ||
				http_code == 302 /* Found */ ||
				http_code == 303 /* See Other */ ||
				http_code == 307 /* Temporary Redirect */) {
			if (feed->redirect_count++ == 5) {
				feed_set_status(feed, FEED_ERROR);
				feed->error = "Too many redirects.";
			} else if (feed->redirect) {
				/* re-submit the same easy handle at the new
				 * location (it must leave the multi stack
				 * before being re-added) */
				curl_multi_remove_handle(curl_multi_handle,
						feed->curl);
				curl_easy_setopt(feed->curl, CURLOPT_URL,
						feed->redirect);
				feed_start_fetch(feed);
				if (http_code == 301 /* Moved Permanently */) {
					/* permanent move: adopt the new URL;
					 * ownership of the redirect string
					 * transfers to feed->url */
					free(feed->url);
					feed->url = feed->redirect;
					feed->redirect = 0;
				}
				/* keep the handle and headers alive for the
				 * re-started fetch */
				return;
			} else {
				feed_set_status(feed, FEED_ERROR);
				feed->error = "Invalid redirect.";
			}
		} else if (http_code == 304 /* Not Modified */) {
			/* cached copy still valid */
			feed_set_status(feed, FEED_OK);
		} else {
			feed_set_status(feed, FEED_ERROR);
			/* report the server's own status line if we got one */
			if (feed->status_line)
				feed->error = feed->status_line;
			else
				feed->error = "Response not understood.";
		}
	} else {
		feed_set_status(feed, FEED_ERROR);
		/* a callback may already have set a more specific error */
		if (!feed->error)
			feed->error = curl_easy_strerror(result);
	}

	/* common cleanup for all completed (non-redirected) transfers */
	curl_multi_remove_handle(curl_multi_handle, feed->curl);
	curl_easy_cleanup(feed->curl);
	feed->curl = 0;
	curl_slist_free_all(feed->headers);
	feed->headers = 0;
	free(feed->data);
	feed->data = 0;
}


/**
 * Parse a feed's XML.
 *
 * Extracts the RSS channel metadata and items from feed->data, then
 * marks the feed FEED_OK.  On any parse failure the feed is set to
 * FEED_ERROR and the document is freed.
 */
void feed_parse(struct feed *feed)
{
	xmlDoc *doc;
	xmlNode *rss;
	xmlNode *channel;
	xmlNode *node;

	assert(feed);
	assert(feed->status == FEED_FETCHING);

	doc = xmlReadMemory(feed->data, feed->data_size, feed->url, 0, 0);
	if (!doc) {
		feed_set_status(feed, FEED_ERROR);
		feed->error = "failed to parse XML";
		return;
	}
	//xmlDebugDumpDocument(stdout, doc);

	/* the raw fetched bytes are no longer needed */
	free(feed->data);
	feed->data = 0;

	/* find the top-level <rss> element
	 * NOTE(review): only RSS is handled; RDF/Atom feeds fail here */
	for (rss = doc->children; rss; rss = rss->next)
		if (rss->type == XML_ELEMENT_NODE &&
				!strcmp(rss->name, "rss"))
			break;
	if (!rss) {
		feed_set_status(feed, FEED_ERROR);
		feed->error = "rss element not found";
		xmlFreeDoc(doc);
		return;
	}

	for (channel = rss->children; channel; channel = channel->next)
		if (channel->type == XML_ELEMENT_NODE &&
				!strcmp(channel->name, "channel"))
			break;
	if (!channel) {
		feed_set_status(feed, FEED_ERROR);
		feed->error = "channel element not found";
		xmlFreeDoc(doc);
		return;
	}

	/* iterate the channel children in reverse (last to first) so that
	 * items are seen oldest-first; feed_parse_item() inserts each new
	 * item at slot 0, leaving the newest item first in feed->item[] */
	for (node = channel->last; node; node = node->prev) {
		if (node->type != XML_ELEMENT_NODE)
			continue;
		if (!strcmp(node->name, "title")) {
			if (feed->title)
				xmlFree(feed->title);
			feed->title = xmlNodeGetContent(node);
		} else if (!strcmp(node->name, "description")) {
			if (feed->description)
				xmlFree(feed->description);
			feed->description = xmlNodeGetContent(node);
		} else if (!strcmp(node->name, "link")) {
			if (feed->link)
				xmlFree(feed->link);
			feed->link = xmlNodeGetContent(node);
		} else if (!strcmp(node->name, "copyright")) {
			if (feed->copyright)
				xmlFree(feed->copyright);
			feed->copyright = xmlNodeGetContent(node);
		} else if (!strcmp(node->name, "pubDate")) {
			if (feed->pub_date)
				xmlFree(feed->pub_date);
			feed->pub_date = xmlNodeGetContent(node);
		} else if (!strcmp(node->name, "category")) {
			if (feed->category)
				xmlFree(feed->category);
			feed->category = xmlNodeGetContent(node);
		} else if (!strcmp(node->name, "item")) {
			feed_parse_item(feed, node);
		}
	}
	xmlFreeDoc(doc);

	/* NOTE(review): pub_date is not cleaned here — possibly
	 * intentional (dates should not contain markup), verify */
	feed_clean_text(feed->title);
	feed_clean_text(feed->description);
	feed_clean_text(feed->link);
	feed_clean_text(feed->copyright);
	feed_clean_text(feed->category);
	feed_set_status(feed, FEED_OK);
}


/**
 * Parse one <item> element of a channel.
 *
 * An item matching an existing one (by guid, else by link) replaces it
 * in place; otherwise all items shift down one slot and the new item is
 * stored at index 0 (newest first), dropping the oldest when the table
 * holds FEED_MAX_ITEMS entries.
 */
void feed_parse_item(struct feed *feed, xmlNode *node)
{
	xmlNode *child;
	xmlChar *title = 0;
	xmlChar *description = 0;
	xmlChar *link = 0;
	xmlChar *author = 0;
	xmlChar *pub_date = 0;
	xmlChar *category = 0;
	xmlChar *guid = 0;
	unsigned int i;

	for (child = node->children; child; child = child->next) {
		if (child->type != XML_ELEMENT_NODE)
			continue;
		if (!strcmp(child->name, "title"))
			title = xmlNodeGetContent(child);
		else if (!strcmp(child->name, "description"))
			description = xmlNodeGetContent(child);
		else if (!strcmp(child->name, "link"))
			link = xmlNodeGetContent(child);
		else if (!strcmp(child->name, "author"))
			author = xmlNodeGetContent(child);
		else if (!strcmp(child->name, "pubDate"))
			pub_date = xmlNodeGetContent(child);
		else if (!strcmp(child->name, "category"))
			category = xmlNodeGetContent(child);
		else if (!strcmp(child->name, "guid"))
			guid = xmlNodeGetContent(child);
	}

	feed_clean_text(title);
	feed_clean_text(description);
	feed_clean_text(link);
	feed_clean_text(author);
	feed_clean_text(category);
	feed_clean_text(guid);

	/* look for an existing item with the same identity */
	for (i = 0; i != feed->item_count; i++) {
		if (guid) {
			if (feed->item[i].guid &&
					!strcmp(feed->item[i].guid, guid))
				break;
		} else if (link) {
			if (feed->item[i].link &&
					!strcmp(feed->item[i].link, link))
				break;
		}
	}

	if (i != feed->item_count) {
		/* old item: replace its content in place (new_item keeps
		 * its previous value, so updates are not re-flagged) */
		feed_free_item(feed, i);
		feed->item[i].title = title;
		feed->item[i].description = description;
		feed->item[i].link = link;
		feed->item[i].author = author;
		feed->item[i].pub_date = pub_date;
		feed->item[i].category = category;
		feed->item[i].guid = guid;
	} else {
		/* new item: drop the oldest if full, shift everything down
		 * one slot (all slots are zero-initialised in feed_add(),
		 * so shifting unused slots is safe), insert at the front */
		if (feed->item_count == FEED_MAX_ITEMS)
			feed_free_item(feed, FEED_MAX_ITEMS - 1);
		memmove(feed->item + 1, feed->item,
				sizeof *feed->item * (FEED_MAX_ITEMS - 1));
		feed->item[0].title = title;
		feed->item[0].description = description;
		feed->item[0].link = link;
		feed->item[0].author = author;
		feed->item[0].pub_date = pub_date;
		feed->item[0].category = category;
		feed->item[0].guid = guid;
		feed->item[0].new_item = true;
		if (feed->item_count != FEED_MAX_ITEMS)
			feed->item_count++;
	}
}


/**
 * Free the strings held by item i of a feed (pointers are left stale;
 * callers overwrite every field immediately afterwards).
 */
void feed_free_item(struct feed *feed, unsigned int i)
{
	if (feed->item[i].title)
		xmlFree(feed->item[i].title);
	if (feed->item[i].description)
		xmlFree(feed->item[i].description);
	if (feed->item[i].link)
		xmlFree(feed->item[i].link);
	if (feed->item[i].author)
		xmlFree(feed->item[i].author);
	if (feed->item[i].pub_date)
		xmlFree(feed->item[i].pub_date);
	if (feed->item[i].category)
		xmlFree(feed->item[i].category);
	if (feed->item[i].guid)
		xmlFree(feed->item[i].guid);
}


/**
 * Clean up text for single-line display, rewriting the string in place.
 *
 * Pass 1 strips markup: "</td...>" becomes " | ", "<br...>" becomes an
 * em dash (UTF-8 e2 80 94), other tags are removed, and a handful of
 * common entities are decoded (every replacement is no longer than the
 * text it replaces, so the in-place rewrite cannot overflow).
 * Pass 2 collapses runs of whitespace to single spaces and trims the
 * ends.  A NULL argument is ignored.
 */
void feed_clean_text(xmlChar *text)
{
	xmlChar *s, *d;

	if (!text)
		return;

	s = d = text;
	while (*s) {
		char *gt;
		if (*s == '<' && (gt = strchr(s, '>'))) {
			/* a complete tag: substitute or drop it */
			if (s[1] == '/' && s[2] == 't' && s[3] == 'd')
				*d++ = ' ', *d++ = '|', *d++ = ' ';
			else if (s[1] == 'b' && s[2] == 'r')
				*d++ = 0xe2, *d++ = 0x80, *d++ = 0x94;
			s = gt + 1;
		} else if (*s == '&') {
			/* decode a few common entities */
			if (s[1] == '#' && s[2] == '3' && s[3] == '9' &&
					s[4] == ';')
				*d++ = '\'', s += 5;
			else if (s[1] == 'n' && s[2] == 'b' && s[3] == 's' &&
					s[4] == 'p' && s[5] == ';')
				*d++ = ' ', s += 6;
			else if (s[1] == 'q' && s[2] == 'u' && s[3] == 'o' &&
					s[4] == 't' && s[5] == ';')
				*d++ = '"', s += 6;
			else if (s[1] == 'a' && s[2] == 'm' && s[3] == 'p' &&
					s[4] == ';')
				*d++ = '&', s += 5;
			else if (s[1] == 'c' && s[2] == 'o' && s[3] == 'p' &&
					s[4] == 'y' && s[5] == ';')
				*d++ = 0xc2, *d++ = 0xa9, s += 6;
			else
				*d++ = *s++;
		} else
			*d++ = *s++;
	}
	*d = 0;

	/* collapse whitespace */
	s = d = text;
	while (*s == '\t' || *s == '\r' || *s == '\n' || *s == ' ')
		s++;
	while (*s) {
		while (*s && !(*s == '\t' || *s == '\r' || *s == '\n' ||
				*s == ' '))
			*d++ = *s++;
		if (*s)
			*d++ = ' ';
		while (*s && (*s == '\t' || *s == '\r' || *s == '\n' ||
				*s == ' '))
			s++;
	}
	*d = 0;
}


/**
 * Start updating all feeds.
 */
void feed_update(void)
{
	unsigned int i;
	for (i = 0; i != feed_count; i++)
		feeds[i].status = FEED_UPDATE;
	feed_work_needed = true;
}


/**
 * Load list of feeds.
*/ bool feed_list_load(const char *path) { FILE *stream; char url[4000]; stream = fopen(path, "r"); if (!stream) { feed_error = strerror(errno); return false; } while (!feof(stream)) { url[0] = 0; fgets(url, sizeof url, stream); if (url[0] == 0 || url[0] == '\n') continue; url[strlen(url) - 1] = 0; if (!feed_add(url)) { fclose(stream); return false; } } if (fclose(stream)) { feed_error = strerror(errno); return false; } return true; } /** * Save list of feeds. */ bool feed_list_save(const char *path) { FILE *stream; unsigned int i; stream = fopen(path, "w"); if (!stream) { feed_error = strerror(errno); return false; } for (i = 0; i != feed_count; i++) { fputs(feeds[i].url, stream); fputc('\n', stream); } if (fclose(stream)) { feed_error = strerror(errno); return false; } return true; } /** * Output a feed. */ void feed_print(struct feed *feed) { unsigned int i; assert(feed); printf("URL: %s\n", feed->url); printf("Status: %s\n", feed_status_name[feed->status]); if (feed->status == FEED_OK) { printf("Title: %s\n", feed->title); printf("Description: %s\n", feed->description); printf("Link: %s\n", feed->link); printf("Copyright: %s\n", feed->copyright); printf("Publ'n date: %s\n", feed->pub_date); printf("Category: %s\n", feed->category); for (i = 0; i != feed->item_count; i++) { printf(" Title: %s\n", feed->item[i].title); printf(" Description: %s\n", feed->item[i].description); printf(" Link: %s\n", feed->item[i].link); printf(" Author: %s\n", feed->item[i].author); printf(" Publ'n date: %s\n", feed->item[i].pub_date); printf(" Category: %s\n", feed->item[i].category); printf(" GUID: %s\n", feed->item[i].guid); printf(" New item: %s\n", feed->item[i].new_item ? "yes" : "no"); } } else if (feed->status == FEED_ERROR) { printf("Error: %s\n", feed->error); } }