diff options
author | Antti Ajanki <antti.ajanki@iki.fi> | 2013-08-06 18:05:44 +0300 |
---|---|---|
committer | Antti Ajanki <antti.ajanki@iki.fi> | 2013-08-06 18:05:44 +0300 |
commit | 27b116a64069ba0c0d734073fd33b5fb67958024 (patch) | |
tree | d96f7be107b119467aa9819a16db9b9bf8863fc7 | |
parent | 1ea55bb8190e782940ff893ac8d492acabbbc886 (diff) | |
download | vdr-plugin-webvideo-27b116a64069ba0c0d734073fd33b5fb67958024.tar.gz vdr-plugin-webvideo-27b116a64069ba0c0d734073fd33b5fb67958024.tar.bz2 |
Hide duplicate links
-rw-r--r-- | src/libwebvi/linkextractor.c | 49 | ||||
-rw-r--r-- | tests/libwebvi_tests.c | 4 | ||||
-rw-r--r-- | tests/linkextractor_tests.c | 42 | ||||
-rw-r--r-- | tests/linkextractor_tests.h | 2 |
4 files changed, 97 insertions, 0 deletions
diff --git a/src/libwebvi/linkextractor.c b/src/libwebvi/linkextractor.c
index c11b926..b0e4270 100644
--- a/src/libwebvi/linkextractor.c
+++ b/src/libwebvi/linkextractor.c
@@ -27,6 +27,8 @@ static void get_links_recursively(TidyDoc tdoc,
                                   GPtrArray *links_found);
 static gchar *parse_link_title(TidyDoc tdoc, TidyNode node);
 static void get_text_content(TidyDoc tdoc, TidyNode node, TidyBuffer* buf);
+static void remove_duplicate_urls(GPtrArray *links);
+static void insert_links_with_longest_titles(gpointer data, gpointer userdata);
 
 LinkExtractor *link_extractor_create(const LinkTemplates *link_templates, const gchar *baseurl) {
   LinkExtractor *extractor;
@@ -67,6 +69,7 @@ GPtrArray *link_extractor_get_links(LinkExtractor *self) {
     err = tidyCleanAndRepair(tdoc);
     if ( err >= 0 ) {
       links = extract_links(self, tdoc);
+      remove_duplicate_urls(links);
     }
   }
 
@@ -147,6 +150,52 @@ void get_text_content(TidyDoc tdoc, TidyNode node, TidyBuffer* buf) {
   }
 }
 
+void remove_duplicate_urls(GPtrArray *links) {
+  /* Remove links with duplicated URLs. Keep the link with the longest
+   * title on the assumption that it is more informative. Keep the
+   * sort order. */
+
+  if (links->len <= 1)
+    return;
+
+  /* seen_urls maps an href (char *) to Link * which has the longest
+   * title. Both keys and values are borrowed references!
+   */
+  GHashTable *seen_urls = g_hash_table_new(g_str_hash, g_str_equal);
+  g_ptr_array_foreach(links, insert_links_with_longest_titles, seen_urls);
+
+  /* Delete links which are not in the hash table */
+  int i = 0;
+  while (i < links->len) {
+    Link *link = g_ptr_array_index(links, i);
+    const char *href = link_get_href(link);
+    Link *link_with_longest_title =
+      (Link *)g_hash_table_lookup(seen_urls, href);
+
+    if (link == link_with_longest_title) {
+      i++;
+    } else {
+      g_ptr_array_remove_index(links, i);
+    }
+  }
+
+  g_hash_table_unref(seen_urls);
+}
+
+void insert_links_with_longest_titles(gpointer data, gpointer userdata) {
+  Link *link = (Link *)data;
+  GHashTable *hashtable = (GHashTable *)userdata;
+  int title_len = strlen(link_get_title(link));
+  const char *href = link_get_href(link);
+  Link *prev_link = (Link *)g_hash_table_lookup(hashtable, href);
+  int prev_link_title_len = 0;
+  if (prev_link)
+    prev_link_title_len = strlen(link_get_title(prev_link));
+  if (!prev_link || (prev_link_title_len < title_len)) {
+    g_hash_table_replace(hashtable, (gpointer)href, (gpointer)link);
+  }
+}
+
 void free_link(gpointer p) {
   link_delete((Link *)p);
 }
diff --git a/tests/libwebvi_tests.c b/tests/libwebvi_tests.c
index 4d57a94..f7975af 100644
--- a/tests/libwebvi_tests.c
+++ b/tests/libwebvi_tests.c
@@ -45,6 +45,10 @@ int main(int argc, char** argv)
              link_extractor_fixture_setup,
              test_link_extractor_title_overrides_content,
              link_extractor_fixture_teardown);
+  g_test_add("/linkextractor/html_title", LinkExtractorFixture, 0,
+             link_extractor_fixture_setup,
+             test_link_extractor_remove_duplicates,
+             link_extractor_fixture_teardown);
 
   g_test_add_func("/menubuilder/mainmenu", test_mainmenu);
   g_test_add("/menubuilder/title", MenuBuilderFixture, 0,
diff --git a/tests/linkextractor_tests.c b/tests/linkextractor_tests.c
index 60deba0..7fe9afc 100644
--- a/tests/linkextractor_tests.c
+++ b/tests/linkextractor_tests.c
@@ -44,6 +44,17 @@
   "<a href=\"" HTML6_HREF "\" title=\"" HTML6_TITLE "\">ignored</a>" \
   "</body></html>"
 
+#define HTML_DUPLICATE_HREF1 "http://example.com/test/1"
+#define HTML_DUPLICATE_TITLE1 "First link"
+#define HTML_DUPLICATE_HREF2 "http://example.com/test/2"
+#define HTML_DUPLICATE_TITLE2 "Second link"
+#define HTML_DUPLICATE_LINKS "<html><body>" \
+  "<a href=\"" HTML_DUPLICATE_HREF1 "\">d1</a>" \
+  "<a href=\"" HTML_DUPLICATE_HREF1 "\">" HTML_DUPLICATE_TITLE1 "</a>" \
+  "<a href=\"" HTML_DUPLICATE_HREF2 "\">" HTML_DUPLICATE_TITLE2 "</a>" \
+  "<a href=\"" HTML_DUPLICATE_HREF2 "\">d2</a>" \
+  "</body></html>"
+
 void link_extractor_fixture_setup(LinkExtractorFixture *fixture,
                                   gconstpointer test_data)
 {
@@ -175,3 +186,34 @@ void test_link_extractor_title_overrides_content(
   g_assert(strcmp(title, HTML6_TITLE) == 0);
   g_ptr_array_free(links, TRUE);
 }
+
+void test_link_extractor_remove_duplicates(LinkExtractorFixture *fixture,
+                                           gconstpointer test_data)
+{
+  link_extractor_append(fixture->extractor, HTML_DUPLICATE_LINKS,
+                        strlen(HTML_DUPLICATE_LINKS));
+  GPtrArray *links = link_extractor_get_links(fixture->extractor);
+  g_assert(links);
+  g_assert(links->len == 2);
+
+  const struct Link *link;
+  const char *href;
+  const char *title;
+  link = g_ptr_array_index(links, 0);
+  href = link_get_href(link);
+  g_assert(href);
+  g_assert(strcmp(href, HTML_DUPLICATE_HREF1) == 0);
+  title = link_get_title(link);
+  g_assert(title);
+  g_assert(strcmp(title, HTML_DUPLICATE_TITLE1) == 0);
+
+  link = g_ptr_array_index(links, 1);
+  href = link_get_href(link);
+  g_assert(href);
+  g_assert(strcmp(href, HTML_DUPLICATE_HREF2) == 0);
+  title = link_get_title(link);
+  g_assert(title);
+  g_assert(strcmp(title, HTML_DUPLICATE_TITLE2) == 0);
+
+  g_ptr_array_free(links, TRUE);
+}
diff --git a/tests/linkextractor_tests.h b/tests/linkextractor_tests.h
index 62cc52f..cea6e2d 100644
--- a/tests/linkextractor_tests.h
+++ b/tests/linkextractor_tests.h
@@ -28,5 +28,7 @@ void test_link_extractor_html_title(LinkExtractorFixture *fixture,
                                     gconstpointer test_data);
 void test_link_extractor_title_overrides_content(LinkExtractorFixture *fixture,
                                                  gconstpointer test_data);
+void test_link_extractor_remove_duplicates(LinkExtractorFixture *fixture,
+                                           gconstpointer test_data);
 
 #endif // LINK_EXTRACTOR_TESTS_H