pcap-rdmasniff.c 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. /*
  2. * Copyright (c) 2017 Pure Storage, Inc.
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions
  7. * are met:
  8. *
  9. * 1. Redistributions of source code must retain the above copyright
  10. * notice, this list of conditions and the following disclaimer.
  11. * 2. Redistributions in binary form must reproduce the above copyright
  12. * notice, this list of conditions and the following disclaimer in the
  13. * documentation and/or other materials provided with the distribution.
  14. * 3. The name of the author may not be used to endorse or promote
  15. * products derived from this software without specific prior written
  16. * permission.
  17. *
  18. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19. * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20. * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21. * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22. * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25. * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26. * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27. * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29. */
  30. #ifdef HAVE_CONFIG_H
  31. #include "config.h"
  32. #endif
  33. #include "pcap-int.h"
  34. #include "pcap-rdmasniff.h"
  35. #include <infiniband/verbs.h>
  36. #include <stdlib.h>
  37. #include <string.h>
  38. #include <sys/time.h>
  39. #if !defined(IBV_FLOW_ATTR_SNIFFER)
  40. #define IBV_FLOW_ATTR_SNIFFER 3
  41. #endif
  /*
   * Number of receive work requests kept posted on the QP.  Also used as
   * the CQ depth and as the number of slots in the receive buffer.
   */
  static const int RDMASNIFF_NUM_RECEIVES = 128;
  /*
   * Size in bytes of one receive slot.  Also the size of the oneshot
   * buffer and the upper bound applied to the snapshot length.
   */
  static const int RDMASNIFF_RECEIVE_SIZE = 10000;
  44. struct pcap_rdmasniff {
  45. struct ibv_device * rdma_device;
  46. struct ibv_context * context;
  47. struct ibv_comp_channel * channel;
  48. struct ibv_pd * pd;
  49. struct ibv_cq * cq;
  50. struct ibv_qp * qp;
  51. struct ibv_flow * flow;
  52. struct ibv_mr * mr;
  53. u_char * oneshot_buffer;
  54. unsigned port_num;
  55. int cq_event;
  56. u_int packets_recv;
  57. };
  58. static int
  59. rdmasniff_stats(pcap_t *handle, struct pcap_stat *stat)
  60. {
  61. struct pcap_rdmasniff *priv = handle->priv;
  62. stat->ps_recv = priv->packets_recv;
  63. stat->ps_drop = 0;
  64. stat->ps_ifdrop = 0;
  65. return 0;
  66. }
  67. static void
  68. rdmasniff_cleanup(pcap_t *handle)
  69. {
  70. struct pcap_rdmasniff *priv = handle->priv;
  71. ibv_dereg_mr(priv->mr);
  72. ibv_destroy_flow(priv->flow);
  73. ibv_destroy_qp(priv->qp);
  74. ibv_destroy_cq(priv->cq);
  75. ibv_dealloc_pd(priv->pd);
  76. ibv_destroy_comp_channel(priv->channel);
  77. ibv_close_device(priv->context);
  78. free(priv->oneshot_buffer);
  79. pcap_cleanup_live_common(handle);
  80. }
  81. static void
  82. rdmasniff_post_recv(pcap_t *handle, uint64_t wr_id)
  83. {
  84. struct pcap_rdmasniff *priv = handle->priv;
  85. struct ibv_sge sg_entry;
  86. struct ibv_recv_wr wr, *bad_wr;
  87. sg_entry.length = RDMASNIFF_RECEIVE_SIZE;
  88. sg_entry.addr = (uintptr_t) handle->buffer + RDMASNIFF_RECEIVE_SIZE * wr_id;
  89. sg_entry.lkey = priv->mr->lkey;
  90. wr.wr_id = wr_id;
  91. wr.num_sge = 1;
  92. wr.sg_list = &sg_entry;
  93. wr.next = NULL;
  94. ibv_post_recv(priv->qp, &wr, &bad_wr);
  95. }
  96. static int
  97. rdmasniff_read(pcap_t *handle, int max_packets, pcap_handler callback, u_char *user)
  98. {
  99. struct pcap_rdmasniff *priv = handle->priv;
  100. struct ibv_cq *ev_cq;
  101. void *ev_ctx;
  102. struct ibv_wc wc;
  103. struct pcap_pkthdr pkth;
  104. u_char *pktd;
  105. int count = 0;
  106. if (!priv->cq_event) {
  107. while (ibv_get_cq_event(priv->channel, &ev_cq, &ev_ctx) < 0) {
  108. if (errno != EINTR) {
  109. return PCAP_ERROR;
  110. }
  111. if (handle->break_loop) {
  112. handle->break_loop = 0;
  113. return PCAP_ERROR_BREAK;
  114. }
  115. }
  116. ibv_ack_cq_events(priv->cq, 1);
  117. ibv_req_notify_cq(priv->cq, 0);
  118. priv->cq_event = 1;
  119. }
  120. while (count < max_packets || PACKET_COUNT_IS_UNLIMITED(max_packets)) {
  121. if (ibv_poll_cq(priv->cq, 1, &wc) != 1) {
  122. priv->cq_event = 0;
  123. break;
  124. }
  125. if (wc.status != IBV_WC_SUCCESS) {
  126. fprintf(stderr, "failed WC wr_id %lld status %d/%s\n",
  127. (unsigned long long) wc.wr_id,
  128. wc.status, ibv_wc_status_str(wc.status));
  129. continue;
  130. }
  131. pkth.len = wc.byte_len;
  132. pkth.caplen = min(pkth.len, (u_int)handle->snapshot);
  133. gettimeofday(&pkth.ts, NULL);
  134. pktd = (u_char *) handle->buffer + wc.wr_id * RDMASNIFF_RECEIVE_SIZE;
  135. if (handle->fcode.bf_insns == NULL ||
  136. bpf_filter(handle->fcode.bf_insns, pktd, pkth.len, pkth.caplen)) {
  137. callback(user, &pkth, pktd);
  138. ++priv->packets_recv;
  139. ++count;
  140. }
  141. rdmasniff_post_recv(handle, wc.wr_id);
  142. if (handle->break_loop) {
  143. handle->break_loop = 0;
  144. return PCAP_ERROR_BREAK;
  145. }
  146. }
  147. return count;
  148. }
  149. static void
  150. rdmasniff_oneshot(u_char *user, const struct pcap_pkthdr *h, const u_char *bytes)
  151. {
  152. struct oneshot_userdata *sp = (struct oneshot_userdata *) user;
  153. pcap_t *handle = sp->pd;
  154. struct pcap_rdmasniff *priv = handle->priv;
  155. *sp->hdr = *h;
  156. memcpy(priv->oneshot_buffer, bytes, h->caplen);
  157. *sp->pkt = priv->oneshot_buffer;
  158. }
  159. static int
  160. rdmasniff_activate(pcap_t *handle)
  161. {
  162. struct pcap_rdmasniff *priv = handle->priv;
  163. struct ibv_qp_init_attr qp_init_attr;
  164. struct ibv_qp_attr qp_attr;
  165. struct ibv_flow_attr flow_attr;
  166. struct ibv_port_attr port_attr;
  167. int i;
  168. priv->context = ibv_open_device(priv->rdma_device);
  169. if (!priv->context) {
  170. pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
  171. "Failed to open device %s", handle->opt.device);
  172. goto error;
  173. }
  174. priv->pd = ibv_alloc_pd(priv->context);
  175. if (!priv->pd) {
  176. pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
  177. "Failed to alloc PD for device %s", handle->opt.device);
  178. goto error;
  179. }
  180. priv->channel = ibv_create_comp_channel(priv->context);
  181. if (!priv->channel) {
  182. pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
  183. "Failed to create comp channel for device %s", handle->opt.device);
  184. goto error;
  185. }
  186. priv->cq = ibv_create_cq(priv->context, RDMASNIFF_NUM_RECEIVES,
  187. NULL, priv->channel, 0);
  188. if (!priv->cq) {
  189. pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
  190. "Failed to create CQ for device %s", handle->opt.device);
  191. goto error;
  192. }
  193. ibv_req_notify_cq(priv->cq, 0);
  194. memset(&qp_init_attr, 0, sizeof qp_init_attr);
  195. qp_init_attr.send_cq = qp_init_attr.recv_cq = priv->cq;
  196. qp_init_attr.cap.max_recv_wr = RDMASNIFF_NUM_RECEIVES;
  197. qp_init_attr.cap.max_recv_sge = 1;
  198. qp_init_attr.qp_type = IBV_QPT_RAW_PACKET;
  199. priv->qp = ibv_create_qp(priv->pd, &qp_init_attr);
  200. if (!priv->qp) {
  201. pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
  202. "Failed to create QP for device %s", handle->opt.device);
  203. goto error;
  204. }
  205. memset(&qp_attr, 0, sizeof qp_attr);
  206. qp_attr.qp_state = IBV_QPS_INIT;
  207. qp_attr.port_num = priv->port_num;
  208. if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE | IBV_QP_PORT)) {
  209. pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
  210. "Failed to modify QP to INIT for device %s", handle->opt.device);
  211. goto error;
  212. }
  213. memset(&qp_attr, 0, sizeof qp_attr);
  214. qp_attr.qp_state = IBV_QPS_RTR;
  215. if (ibv_modify_qp(priv->qp, &qp_attr, IBV_QP_STATE)) {
  216. pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
  217. "Failed to modify QP to RTR for device %s", handle->opt.device);
  218. goto error;
  219. }
  220. memset(&flow_attr, 0, sizeof flow_attr);
  221. flow_attr.type = IBV_FLOW_ATTR_SNIFFER;
  222. flow_attr.size = sizeof flow_attr;
  223. flow_attr.port = priv->port_num;
  224. priv->flow = ibv_create_flow(priv->qp, &flow_attr);
  225. if (!priv->flow) {
  226. pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
  227. "Failed to create flow for device %s", handle->opt.device);
  228. goto error;
  229. }
  230. handle->bufsize = RDMASNIFF_NUM_RECEIVES * RDMASNIFF_RECEIVE_SIZE;
  231. handle->buffer = malloc(handle->bufsize);
  232. if (!handle->buffer) {
  233. pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
  234. "Failed to allocate receive buffer for device %s", handle->opt.device);
  235. goto error;
  236. }
  237. priv->oneshot_buffer = malloc(RDMASNIFF_RECEIVE_SIZE);
  238. if (!priv->oneshot_buffer) {
  239. pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
  240. "Failed to allocate oneshot buffer for device %s", handle->opt.device);
  241. goto error;
  242. }
  243. priv->mr = ibv_reg_mr(priv->pd, handle->buffer, handle->bufsize, IBV_ACCESS_LOCAL_WRITE);
  244. if (!priv->mr) {
  245. pcap_snprintf(handle->errbuf, PCAP_ERRBUF_SIZE,
  246. "Failed to register MR for device %s", handle->opt.device);
  247. goto error;
  248. }
  249. for (i = 0; i < RDMASNIFF_NUM_RECEIVES; ++i) {
  250. rdmasniff_post_recv(handle, i);
  251. }
  252. if (!ibv_query_port(priv->context, priv->port_num, &port_attr) &&
  253. port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
  254. handle->linktype = DLT_INFINIBAND;
  255. } else {
  256. handle->linktype = DLT_EN10MB;
  257. }
  258. if (handle->snapshot <= 0 || handle->snapshot > RDMASNIFF_RECEIVE_SIZE)
  259. handle->snapshot = RDMASNIFF_RECEIVE_SIZE;
  260. handle->offset = 0;
  261. handle->read_op = rdmasniff_read;
  262. handle->stats_op = rdmasniff_stats;
  263. handle->cleanup_op = rdmasniff_cleanup;
  264. handle->setfilter_op = install_bpf_program;
  265. handle->setdirection_op = NULL;
  266. handle->set_datalink_op = NULL;
  267. handle->getnonblock_op = pcap_getnonblock_fd;
  268. handle->setnonblock_op = pcap_setnonblock_fd;
  269. handle->oneshot_callback = rdmasniff_oneshot;
  270. handle->selectable_fd = priv->channel->fd;
  271. return 0;
  272. error:
  273. if (priv->mr) {
  274. ibv_dereg_mr(priv->mr);
  275. }
  276. if (priv->flow) {
  277. ibv_destroy_flow(priv->flow);
  278. }
  279. if (priv->qp) {
  280. ibv_destroy_qp(priv->qp);
  281. }
  282. if (priv->cq) {
  283. ibv_destroy_cq(priv->cq);
  284. }
  285. if (priv->channel) {
  286. ibv_destroy_comp_channel(priv->channel);
  287. }
  288. if (priv->pd) {
  289. ibv_dealloc_pd(priv->pd);
  290. }
  291. if (priv->context) {
  292. ibv_close_device(priv->context);
  293. }
  294. if (priv->oneshot_buffer) {
  295. free(priv->oneshot_buffer);
  296. }
  297. return PCAP_ERROR;
  298. }
  299. pcap_t *
  300. rdmasniff_create(const char *device, char *ebuf, int *is_ours)
  301. {
  302. struct pcap_rdmasniff *priv;
  303. struct ibv_device **dev_list;
  304. int numdev;
  305. size_t namelen;
  306. const char *port;
  307. unsigned port_num;
  308. int i;
  309. pcap_t *p = NULL;
  310. *is_ours = 0;
  311. dev_list = ibv_get_device_list(&numdev);
  312. if (!dev_list || !numdev) {
  313. return NULL;
  314. }
  315. namelen = strlen(device);
  316. port = strchr(device, ':');
  317. if (port) {
  318. port_num = strtoul(port + 1, NULL, 10);
  319. if (port_num > 0) {
  320. namelen = port - device;
  321. } else {
  322. port_num = 1;
  323. }
  324. } else {
  325. port_num = 1;
  326. }
  327. for (i = 0; i < numdev; ++i) {
  328. if (strlen(dev_list[i]->name) == namelen &&
  329. !strncmp(device, dev_list[i]->name, namelen)) {
  330. *is_ours = 1;
  331. p = pcap_create_common(ebuf, sizeof (struct pcap_rdmasniff));
  332. if (p) {
  333. p->activate_op = rdmasniff_activate;
  334. priv = p->priv;
  335. priv->rdma_device = dev_list[i];
  336. priv->port_num = port_num;
  337. }
  338. break;
  339. }
  340. }
  341. ibv_free_device_list(dev_list);
  342. return p;
  343. }
  344. int
  345. rdmasniff_findalldevs(pcap_if_list_t *devlistp, char *err_str)
  346. {
  347. struct ibv_device **dev_list;
  348. int numdev;
  349. int i;
  350. int ret = 0;
  351. dev_list = ibv_get_device_list(&numdev);
  352. if (!dev_list || !numdev) {
  353. return 0;
  354. }
  355. for (i = 0; i < numdev; ++i) {
  356. /*
  357. * XXX - do the notions of "up", "running", or
  358. * "connected" apply here?
  359. */
  360. if (!add_dev(devlistp, dev_list[i]->name, 0, "RDMA sniffer", err_str)) {
  361. ret = -1;
  362. goto out;
  363. }
  364. }
  365. out:
  366. ibv_free_device_list(dev_list);
  367. return ret;
  368. }