crawler.c 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210
/***************************************************************************
 *                                  _   _ ____  _
 *  Project                     ___| | | |  _ \| |
 *                             / __| | | | |_) | |
 *                            | (__| |_| |  _ <| |___
 *                             \___|\___/|_| \_\_____|
 *
 * Web crawler based on curl and libxml2.
 * Copyright (C) 2018 Jeroen Ooms <jeroenooms@gmail.com>
 * License: MIT
 *
 * To compile:
 *   gcc crawler.c $(pkg-config --cflags --libs libxml-2.0 libcurl)
 *
 */
  16. /* <DESC>
  17. * Web crawler based on curl and libxml2 to stress-test curl with
  18. * hundreds of concurrent connections to various servers.
  19. * </DESC>
  20. */
/* Parameters */
/* Passed to CURLMOPT_MAX_TOTAL_CONNECTIONS on the multi handle in main(). */
int max_con = 200;
/* Links are only followed while (complete + pending) stays below this. */
int max_total = 20000;
/* Links are only followed while the pending counter stays below this. */
int max_requests = 500;
/* Per-page cap checked by follow_links() while queueing links. */
int max_link_per_page = 5;
/* When non-zero, each href is resolved against the page URL with
 * xmlBuildURI() before it is examined. */
int follow_relative_links = 0;
/* First URL handed to the multi handle in main(). */
char *start_page = "https://www.reuters.com";
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/uri.h>
#include <curl/curl.h>
#include <limits.h>
#include <math.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
  36. int pending_interrupt = 0;
  37. void sighandler(int dummy)
  38. {
  39. pending_interrupt = 1;
  40. }
  41. /* resizable buffer */
  42. typedef struct {
  43. char *buf;
  44. size_t size;
  45. } memory;
  46. size_t grow_buffer(void *contents, size_t sz, size_t nmemb, void *ctx)
  47. {
  48. size_t realsize = sz * nmemb;
  49. memory *mem = (memory*) ctx;
  50. mem->buf = realloc(mem->buf, mem->size + realsize);
  51. memcpy(&(mem->buf[mem->size]), contents, realsize);
  52. mem->size += realsize;
  53. return realsize;
  54. }
  55. CURL *make_handle(char *url)
  56. {
  57. CURL *handle = curl_easy_init();
  58. /* Important: use HTTP2 over HTTPS */
  59. curl_easy_setopt(handle, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_2TLS);
  60. curl_easy_setopt(handle, CURLOPT_URL, url);
  61. /* buffer body */
  62. memory *mem = malloc(sizeof(memory));
  63. mem->size = 0;
  64. mem->buf = malloc(1);
  65. curl_easy_setopt(handle, CURLOPT_WRITEFUNCTION, grow_buffer);
  66. curl_easy_setopt(handle, CURLOPT_WRITEDATA, mem);
  67. curl_easy_setopt(handle, CURLOPT_PRIVATE, mem);
  68. /* For completeness */
  69. curl_easy_setopt(handle, CURLOPT_ENCODING, "gzip, deflate");
  70. curl_easy_setopt(handle, CURLOPT_TIMEOUT, 5L);
  71. curl_easy_setopt(handle, CURLOPT_FOLLOWLOCATION, 1L);
  72. curl_easy_setopt(handle, CURLOPT_MAXREDIRS, 10L);
  73. curl_easy_setopt(handle, CURLOPT_CONNECTTIMEOUT, 2L);
  74. curl_easy_setopt(handle, CURLOPT_COOKIEFILE, "");
  75. curl_easy_setopt(handle, CURLOPT_FILETIME, 1L);
  76. curl_easy_setopt(handle, CURLOPT_USERAGENT, "mini crawler");
  77. curl_easy_setopt(handle, CURLOPT_HTTPAUTH, CURLAUTH_ANY);
  78. curl_easy_setopt(handle, CURLOPT_UNRESTRICTED_AUTH, 1L);
  79. curl_easy_setopt(handle, CURLOPT_PROXYAUTH, CURLAUTH_ANY);
  80. curl_easy_setopt(handle, CURLOPT_EXPECT_100_TIMEOUT_MS, 0L);
  81. return handle;
  82. }
  83. /* HREF finder implemented in libxml2 but could be any HTML parser */
  84. size_t follow_links(CURLM *multi_handle, memory *mem, char *url)
  85. {
  86. int opts = HTML_PARSE_NOBLANKS | HTML_PARSE_NOERROR | \
  87. HTML_PARSE_NOWARNING | HTML_PARSE_NONET;
  88. htmlDocPtr doc = htmlReadMemory(mem->buf, mem->size, url, NULL, opts);
  89. if(!doc)
  90. return 0;
  91. xmlChar *xpath = (xmlChar*) "//a/@href";
  92. xmlXPathContextPtr context = xmlXPathNewContext(doc);
  93. xmlXPathObjectPtr result = xmlXPathEvalExpression(xpath, context);
  94. xmlXPathFreeContext(context);
  95. if(!result)
  96. return 0;
  97. xmlNodeSetPtr nodeset = result->nodesetval;
  98. if(xmlXPathNodeSetIsEmpty(nodeset)) {
  99. xmlXPathFreeObject(result);
  100. return 0;
  101. }
  102. size_t count = 0;
  103. for(int i = 0; i < nodeset->nodeNr; i++) {
  104. double r = rand();
  105. int x = r * nodeset->nodeNr / RAND_MAX;
  106. const xmlNode *node = nodeset->nodeTab[x]->xmlChildrenNode;
  107. xmlChar *href = xmlNodeListGetString(doc, node, 1);
  108. if(follow_relative_links) {
  109. xmlChar *orig = href;
  110. href = xmlBuildURI(href, (xmlChar *) url);
  111. xmlFree(orig);
  112. }
  113. char *link = (char *) href;
  114. if(!link || strlen(link) < 20)
  115. continue;
  116. if(!strncmp(link, "http://", 7) || !strncmp(link, "https://", 8)) {
  117. curl_multi_add_handle(multi_handle, make_handle(link));
  118. if(count++ == max_link_per_page)
  119. break;
  120. }
  121. xmlFree(link);
  122. }
  123. xmlXPathFreeObject(result);
  124. return count;
  125. }
  126. int is_html(char *ctype)
  127. {
  128. return ctype != NULL && strlen(ctype) > 10 && strstr(ctype, "text/html");
  129. }
  130. int main(void)
  131. {
  132. signal(SIGINT, sighandler);
  133. LIBXML_TEST_VERSION;
  134. curl_global_init(CURL_GLOBAL_DEFAULT);
  135. CURLM *multi_handle = curl_multi_init();
  136. curl_multi_setopt(multi_handle, CURLMOPT_MAX_TOTAL_CONNECTIONS, max_con);
  137. curl_multi_setopt(multi_handle, CURLMOPT_MAX_HOST_CONNECTIONS, 6L);
  138. /* enables http/2 if available */
  139. #ifdef CURLPIPE_MULTIPLEX
  140. curl_multi_setopt(multi_handle, CURLMOPT_PIPELINING, CURLPIPE_MULTIPLEX);
  141. #endif
  142. /* sets html start page */
  143. curl_multi_add_handle(multi_handle, make_handle(start_page));
  144. int msgs_left;
  145. int pending = 0;
  146. int complete = 0;
  147. int still_running = 1;
  148. while(still_running && !pending_interrupt) {
  149. int numfds;
  150. curl_multi_wait(multi_handle, NULL, 0, 1000, &numfds);
  151. curl_multi_perform(multi_handle, &still_running);
  152. /* See how the transfers went */
  153. CURLMsg *m = NULL;
  154. while((m = curl_multi_info_read(multi_handle, &msgs_left))) {
  155. if(m->msg == CURLMSG_DONE) {
  156. CURL *handle = m->easy_handle;
  157. char *url;
  158. memory *mem;
  159. curl_easy_getinfo(handle, CURLINFO_PRIVATE, &mem);
  160. curl_easy_getinfo(handle, CURLINFO_EFFECTIVE_URL, &url);
  161. if(m->data.result == CURLE_OK) {
  162. long res_status;
  163. curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &res_status);
  164. if(res_status == 200) {
  165. char *ctype;
  166. curl_easy_getinfo(handle, CURLINFO_CONTENT_TYPE, &ctype);
  167. printf("[%d] HTTP 200 (%s): %s\n", complete, ctype, url);
  168. if(is_html(ctype) && mem->size > 100) {
  169. if(pending < max_requests && (complete + pending) < max_total) {
  170. pending += follow_links(multi_handle, mem, url);
  171. still_running = 1;
  172. }
  173. }
  174. }
  175. else {
  176. printf("[%d] HTTP %d: %s\n", complete, (int) res_status, url);
  177. }
  178. }
  179. else {
  180. printf("[%d] Connection failure: %s\n", complete, url);
  181. }
  182. curl_multi_remove_handle(multi_handle, handle);
  183. curl_easy_cleanup(handle);
  184. free(mem->buf);
  185. free(mem);
  186. complete++;
  187. pending--;
  188. }
  189. }
  190. }
  191. curl_multi_cleanup(multi_handle);
  192. curl_global_cleanup();
  193. return 0;
  194. }