summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author Antti Ajanki <antti.ajanki@iki.fi> 2013-08-06 18:05:44 +0300
committer Antti Ajanki <antti.ajanki@iki.fi> 2013-08-06 18:05:44 +0300
commit 27b116a64069ba0c0d734073fd33b5fb67958024 (patch)
tree d96f7be107b119467aa9819a16db9b9bf8863fc7 /src
parent 1ea55bb8190e782940ff893ac8d492acabbbc886 (diff)
download vdr-plugin-webvideo-27b116a64069ba0c0d734073fd33b5fb67958024.tar.gz
vdr-plugin-webvideo-27b116a64069ba0c0d734073fd33b5fb67958024.tar.bz2
Hide duplicate links
Diffstat (limited to 'src')
-rw-r--r-- src/libwebvi/linkextractor.c 49
1 files changed, 49 insertions, 0 deletions
diff --git a/src/libwebvi/linkextractor.c b/src/libwebvi/linkextractor.c
index c11b926..b0e4270 100644
--- a/src/libwebvi/linkextractor.c
+++ b/src/libwebvi/linkextractor.c
@@ -27,6 +27,8 @@ static void get_links_recursively(TidyDoc tdoc,
GPtrArray *links_found);
static gchar *parse_link_title(TidyDoc tdoc, TidyNode node);
static void get_text_content(TidyDoc tdoc, TidyNode node, TidyBuffer* buf);
+static void remove_duplicate_urls(GPtrArray *links);
+static void insert_links_with_longest_titles(gpointer data, gpointer userdata);
LinkExtractor *link_extractor_create(const LinkTemplates *link_templates, const gchar *baseurl) {
LinkExtractor *extractor;
@@ -67,6 +69,7 @@ GPtrArray *link_extractor_get_links(LinkExtractor *self) {
err = tidyCleanAndRepair(tdoc);
if ( err >= 0 ) {
links = extract_links(self, tdoc);
+ remove_duplicate_urls(links);
}
}
@@ -147,6 +150,52 @@ void get_text_content(TidyDoc tdoc, TidyNode node, TidyBuffer* buf) {
}
}
+/*
+ * Remove links with duplicated URLs from `links`, in place.
+ *
+ * For every distinct href only the link with the longest title is
+ * kept, on the assumption that it is the most informative one. The
+ * relative order of the surviving links is preserved.
+ *
+ * `links` may be NULL or hold fewer than two elements, in which case
+ * nothing is done. Dropped entries are removed with
+ * g_ptr_array_remove_index(), so they are released through the array's
+ * element free function if one was set when the array was created.
+ */
+void remove_duplicate_urls(GPtrArray *links) {
+  if (links == NULL || links->len <= 1)
+    return;
+
+  /* seen_urls maps an href (char *) to the Link * which has the longest
+   * title for that href. Both keys and values are borrowed references
+   * owned by the links still stored in the array, so the table is
+   * created without destroy notifiers.
+   */
+  GHashTable *seen_urls = g_hash_table_new(g_str_hash, g_str_equal);
+  g_ptr_array_foreach(links, insert_links_with_longest_titles, seen_urls);
+
+  /* Delete links which are not in the hash table. The index only
+   * advances when the current element is kept, because removing an
+   * element shifts the following ones down by one position. The index
+   * is a guint to match the type of links->len. */
+  guint i = 0;
+  while (i < links->len) {
+    Link *link = g_ptr_array_index(links, i);
+    const char *href = link_get_href(link);
+    /* A link without an href cannot be looked up in a string-keyed
+     * table and is not a duplicate of anything: always keep it. */
+    Link *link_with_longest_title =
+      (href != NULL) ? (Link *)g_hash_table_lookup(seen_urls, href) : link;
+
+    if (link == link_with_longest_title) {
+      i++;
+    } else {
+      g_ptr_array_remove_index(links, i);
+    }
+  }
+
+  g_hash_table_unref(seen_urls);
+}
+
+/*
+ * GFunc callback for g_ptr_array_foreach().
+ *
+ * data:     the Link * being visited.
+ * userdata: a GHashTable * mapping href (char *) -> Link *, holding
+ *           borrowed references only.
+ *
+ * Records the link in the table if it is the first one seen for its
+ * href, or if its title is strictly longer than the title of the link
+ * currently recorded for that href. On equal lengths the earlier link
+ * stays. A missing (NULL) title counts as length 0; a link without an
+ * href is left out of the table.
+ */
+void insert_links_with_longest_titles(gpointer data, gpointer userdata) {
+  Link *link = (Link *)data;
+  GHashTable *hashtable = (GHashTable *)userdata;
+  const char *href = link_get_href(link);
+
+  /* g_str_hash()/g_str_equal() are not NULL-safe, so a NULL href must
+   * never be used as a key. */
+  if (href == NULL)
+    return;
+
+  const char *title = link_get_title(link);
+  size_t title_len = (title != NULL) ? strlen(title) : 0;
+
+  Link *prev_link = (Link *)g_hash_table_lookup(hashtable, href);
+  if (prev_link != NULL) {
+    const char *prev_title = link_get_title(prev_link);
+    size_t prev_link_title_len = (prev_title != NULL) ? strlen(prev_title) : 0;
+    if (prev_link_title_len >= title_len)
+      return;
+  }
+
+  /* Use replace, not insert: the key has to be swapped to this link's
+   * own href string, because the previously recorded link (which owns
+   * the old key) is the one that will be removed from the array. */
+  g_hash_table_replace(hashtable, (gpointer)href, (gpointer)link);
+}
+
/* Release a Link received as an untyped pointer by casting it to
 * Link * and handing it to link_delete().
 * NOTE(review): the gpointer signature suggests this is meant to be
 * used as a GLib destroy callback -- confirm at the registration site. */
void free_link(gpointer p) {
link_delete((Link *)p);
}