summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Antti Ajanki <antti.ajanki@iki.fi> 2013-08-06 18:05:44 +0300
committer Antti Ajanki <antti.ajanki@iki.fi> 2013-08-06 18:05:44 +0300
commit 27b116a64069ba0c0d734073fd33b5fb67958024 (patch)
tree d96f7be107b119467aa9819a16db9b9bf8863fc7
parent 1ea55bb8190e782940ff893ac8d492acabbbc886 (diff)
download vdr-plugin-webvideo-27b116a64069ba0c0d734073fd33b5fb67958024.tar.gz
vdr-plugin-webvideo-27b116a64069ba0c0d734073fd33b5fb67958024.tar.bz2
Hide duplicate links
-rw-r--r-- src/libwebvi/linkextractor.c 49
-rw-r--r-- tests/libwebvi_tests.c 4
-rw-r--r-- tests/linkextractor_tests.c 42
-rw-r--r-- tests/linkextractor_tests.h 2
4 files changed, 97 insertions, 0 deletions
diff --git a/src/libwebvi/linkextractor.c b/src/libwebvi/linkextractor.c
index c11b926..b0e4270 100644
--- a/src/libwebvi/linkextractor.c
+++ b/src/libwebvi/linkextractor.c
@@ -27,6 +27,8 @@ static void get_links_recursively(TidyDoc tdoc,
GPtrArray *links_found);
static gchar *parse_link_title(TidyDoc tdoc, TidyNode node);
static void get_text_content(TidyDoc tdoc, TidyNode node, TidyBuffer* buf);
+static void remove_duplicate_urls(GPtrArray *links);
+static void insert_links_with_longest_titles(gpointer data, gpointer userdata);
LinkExtractor *link_extractor_create(const LinkTemplates *link_templates, const gchar *baseurl) {
LinkExtractor *extractor;
@@ -67,6 +69,7 @@ GPtrArray *link_extractor_get_links(LinkExtractor *self) {
err = tidyCleanAndRepair(tdoc);
if ( err >= 0 ) {
links = extract_links(self, tdoc);
+ remove_duplicate_urls(links);
}
}
@@ -147,6 +150,52 @@ void get_text_content(TidyDoc tdoc, TidyNode node, TidyBuffer* buf) {
}
}
+/*
+ * Remove links with duplicated URLs from `links`, in place.
+ *
+ * For every href only the link with the longest title is kept, on the
+ * assumption that it is more informative. The relative order of the
+ * remaining links is preserved. A NULL array is accepted and ignored.
+ */
+void remove_duplicate_urls(GPtrArray *links) {
+  /* Nothing to deduplicate in a missing, empty or one-element array. */
+  if (!links || links->len <= 1)
+    return;
+
+  /* seen_urls maps an href (char *) to the Link * which has the longest
+   * title. Both keys and values are borrowed references, so the table
+   * is created without any destroy functions.
+   */
+  GHashTable *seen_urls = g_hash_table_new(g_str_hash, g_str_equal);
+  g_ptr_array_foreach(links, insert_links_with_longest_titles, seen_urls);
+
+  /* Delete links which are not the ones recorded in the hash table.
+   * g_ptr_array_remove_index() moves the following elements down one
+   * place, so the index only advances when the current link is kept.
+   * The index is unsigned to match the type of links->len.
+   */
+  guint i = 0;
+  while (i < links->len) {
+    Link *link = g_ptr_array_index(links, i);
+    const char *href = link_get_href(link);
+
+    /* g_str_hash() cannot hash NULL, so a link without an href is never
+     * looked up and is simply kept.
+     * NOTE(review): whether link_get_href() can actually return NULL is
+     * not visible from here -- confirm against link.c. */
+    if (!href || link == (Link *)g_hash_table_lookup(seen_urls, href)) {
+      i++;
+    } else {
+      g_ptr_array_remove_index(links, i);
+    }
+  }
+
+  g_hash_table_unref(seen_urls);
+}
+
+/*
+ * g_ptr_array_foreach() callback used by remove_duplicate_urls().
+ *
+ * data:     the Link * being visited.
+ * userdata: a GHashTable * mapping href (char *) to Link *.
+ *
+ * Records the link under its href unless the table already holds a link
+ * with the same href whose title is at least as long; on equal lengths
+ * the link seen first wins. A missing title counts as length 0 and a
+ * link without an href is not recorded at all.
+ */
+void insert_links_with_longest_titles(gpointer data, gpointer userdata) {
+  Link *link = (Link *)data;
+  GHashTable *hashtable = (GHashTable *)userdata;
+
+  /* NOTE(review): NULL returns from the accessors are handled
+   * defensively; whether they can occur is not visible from here. */
+  const char *href = link_get_href(link);
+  if (!href)
+    return;
+
+  const char *title = link_get_title(link);
+  size_t title_len = title ? strlen(title) : 0;
+
+  Link *prev_link = (Link *)g_hash_table_lookup(hashtable, href);
+  if (prev_link) {
+    const char *prev_title = link_get_title(prev_link);
+    size_t prev_link_title_len = prev_title ? strlen(prev_title) : 0;
+    if (prev_link_title_len >= title_len)
+      return;
+  }
+
+  /* g_hash_table_replace() (rather than g_hash_table_insert()) also
+   * swaps the stored key, so the borrowed key always points at the href
+   * of the link that is stored as the value. */
+  g_hash_table_replace(hashtable, (gpointer)href, (gpointer)link);
+}
+
void free_link(gpointer p) {
link_delete((Link *)p);
}
diff --git a/tests/libwebvi_tests.c b/tests/libwebvi_tests.c
index 4d57a94..f7975af 100644
--- a/tests/libwebvi_tests.c
+++ b/tests/libwebvi_tests.c
@@ -45,6 +45,10 @@ int main(int argc, char** argv)
link_extractor_fixture_setup,
test_link_extractor_title_overrides_content,
link_extractor_fixture_teardown);
+ g_test_add("/linkextractor/html_title", LinkExtractorFixture, 0,
+ link_extractor_fixture_setup,
+ test_link_extractor_remove_duplicates,
+ link_extractor_fixture_teardown);
g_test_add_func("/menubuilder/mainmenu", test_mainmenu);
g_test_add("/menubuilder/title", MenuBuilderFixture, 0,
diff --git a/tests/linkextractor_tests.c b/tests/linkextractor_tests.c
index 60deba0..7fe9afc 100644
--- a/tests/linkextractor_tests.c
+++ b/tests/linkextractor_tests.c
@@ -44,6 +44,17 @@
"<a href=\"" HTML6_HREF "\" title=\"" HTML6_TITLE "\">ignored</a>" \
"</body></html>"
+#define HTML_DUPLICATE_HREF1 "http://example.com/test/1"
+#define HTML_DUPLICATE_TITLE1 "First link"
+#define HTML_DUPLICATE_HREF2 "http://example.com/test/2"
+#define HTML_DUPLICATE_TITLE2 "Second link"
+#define HTML_DUPLICATE_LINKS "<html><body>" \
+ "<a href=\"" HTML_DUPLICATE_HREF1 "\">d1</a>" \
+ "<a href=\"" HTML_DUPLICATE_HREF1 "\">" HTML_DUPLICATE_TITLE1 "</a>" \
+ "<a href=\"" HTML_DUPLICATE_HREF2 "\">" HTML_DUPLICATE_TITLE2 "</a>" \
+ "<a href=\"" HTML_DUPLICATE_HREF2 "\">d2</a>" \
+ "</body></html>"
+
void link_extractor_fixture_setup(LinkExtractorFixture *fixture,
gconstpointer test_data) {
@@ -175,3 +186,34 @@ void test_link_extractor_title_overrides_content(
g_assert(strcmp(title, HTML6_TITLE) == 0);
g_ptr_array_free(links, TRUE);
}
+
+/*
+ * Feed the extractor a page in which two hrefs each appear twice, once
+ * with a short title ("d1"/"d2") and once with a long one, in both
+ * orders. Exactly one link per href must remain, carrying the long
+ * title, and the hrefs must keep their document order.
+ */
+void test_link_extractor_remove_duplicates(LinkExtractorFixture *fixture,
+                                           gconstpointer test_data)
+{
+  link_extractor_append(fixture->extractor, HTML_DUPLICATE_LINKS,
+                        strlen(HTML_DUPLICATE_LINKS));
+  GPtrArray *links = link_extractor_get_links(fixture->extractor);
+  g_assert(links);
+  /* The g_assert_cmp*() macros print both operands on failure and
+   * g_assert_cmpstr() treats a NULL string as a mismatch, so no separate
+   * non-NULL checks are needed for href and title. */
+  g_assert_cmpuint(links->len, ==, 2);
+
+  const struct Link *link;
+
+  /* First href: the short title came first, the long one must survive. */
+  link = g_ptr_array_index(links, 0);
+  g_assert_cmpstr(link_get_href(link), ==, HTML_DUPLICATE_HREF1);
+  g_assert_cmpstr(link_get_title(link), ==, HTML_DUPLICATE_TITLE1);
+
+  /* Second href: the long title came first and must be kept. */
+  link = g_ptr_array_index(links, 1);
+  g_assert_cmpstr(link_get_href(link), ==, HTML_DUPLICATE_HREF2);
+  g_assert_cmpstr(link_get_title(link), ==, HTML_DUPLICATE_TITLE2);
+
+  g_ptr_array_free(links, TRUE);
+}
diff --git a/tests/linkextractor_tests.h b/tests/linkextractor_tests.h
index 62cc52f..cea6e2d 100644
--- a/tests/linkextractor_tests.h
+++ b/tests/linkextractor_tests.h
@@ -28,5 +28,7 @@ void test_link_extractor_html_title(LinkExtractorFixture *fixture,
gconstpointer test_data);
void test_link_extractor_title_overrides_content(LinkExtractorFixture *fixture,
gconstpointer test_data);
+void test_link_extractor_remove_duplicates(LinkExtractorFixture *fixture,
+ gconstpointer test_data);
#endif // LINK_EXTRACTOR_TESTS_H