diff --git a/README.md b/README.md index cb4d779..db75db9 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,9 @@ Usage: Path MTU Daemon is captures and broadcasts ICMP messages related to MTU detection. It listens on an interface, waiting for ICMP messages (IPv4 type 3 code 4 or IPv6 type 2 code 0) and it forwards them -verbatim to the broadcast ethernet address. +verbatim to the broadcast ethernet address by default. If a list of peers +is given, ICMP messages are instead forwarded to those peers using normal +routing, enabling distribution across different subnets. Options: @@ -31,6 +33,8 @@ Options: --ports Forward only ICMP packets with payload containing L4 source port on this list (comma separated) + --peers Resend ICMP packets to this peer list + (comma separated) --help Print this message Example: @@ -46,7 +50,8 @@ Once again, it listens waiting for packets matching: (ether dst not ff:ff:ff:ff:ff:ff) And having appropriate length, and forwards them to ethernet broadcast -ff:ff:ff:ff:ff:ff. +ff:ff:ff:ff:ff:ff, or to the configured peers using normal packet routing +if a list of peers is specified. 
To debug use tcpdump: diff --git a/src/main.c b/src/main.c index 144df31..825c54d 100644 --- a/src/main.c +++ b/src/main.c @@ -55,6 +55,9 @@ static void usage() " containing L4 source port on this " "list\n" " (comma separated)\n" + " --peers Resend ICMP packets to this peer " + "list\n" + " (comma separated)\n" " --help Print this message\n" "\n" "Example:\n" @@ -90,12 +93,15 @@ struct state pcap_t *pcap; struct nflog *nflog; int raw_sd; + int raw4; + int raw6; struct hashlimit *sources; struct hashlimit *ifaces; int verbose; int dry_run; int strict; uint64_t *ports_map; + void *peer_list; }; static int handle_packet(const uint8_t *p, unsigned data_len, void *userdata) @@ -105,6 +111,7 @@ static int handle_packet(const uint8_t *p, unsigned data_len, void *userdata) const char *reason = "unknown"; int mtu_of_next_hop = -1; int l4_sport = -1; + int ttl = -1; /* assumming DLT_EN10MB */ @@ -145,6 +152,7 @@ static int handle_packet(const uint8_t *p, unsigned data_len, void *userdata) valid = 1; hash = &p[l3_offset + 12]; hash_len = 4; + ttl = p[l3_offset + 8]; } } @@ -158,6 +166,7 @@ static int handle_packet(const uint8_t *p, unsigned data_len, void *userdata) valid = 1; hash = &p[l3_offset + 8]; hash_len = 16; + ttl = p[l3_offset + 7]; } } @@ -233,6 +242,13 @@ static int handle_packet(const uint8_t *p, unsigned data_len, void *userdata) } } + /* Check if this packet was received from a L3 peer */ + if (state->peer_list != NULL && + check_peerlist(state->peer_list, hash, hash_len) == 0) { + reason = "Received from L3 peer"; + goto reject; + } + uint8_t dst_mac[6]; memcpy(dst_mac, p, 6); @@ -275,10 +291,17 @@ static int handle_packet(const uint8_t *p, unsigned data_len, void *userdata) } if (state->dry_run == 0) { - int r = send(state->raw_sd, pp, data_len, 0); - /* ENOBUFS happens during IRQ storms okay to ignore */ - if (r < 0 && errno != ENOBUFS) { - PFATAL("send()"); + if (state->peer_list == NULL) { + int r = send(state->raw_sd, pp, data_len, 0); + /* ENOBUFS 
happens during IRQ storms okay to ignore */ + if (r < 0 && errno != ENOBUFS) { + PFATAL("send()"); + } + } else { + sendto_peerlist(state->peer_list, state->raw4, + state->raw6, hash_len, + p + icmp_offset, + data_len - icmp_offset, ttl); } } return 1; @@ -370,6 +393,7 @@ int main(int argc, char *argv[]) {"help", no_argument, 0, 'h'}, {"ports", required_argument, 0, 'p'}, {"strict", no_argument, 0, 't'}, + {"peers", required_argument, 0, 'P'}, {NULL, 0, 0, 0}}; const char *optstring = optstring_from_long_options(long_options); @@ -382,6 +406,7 @@ int main(int argc, char *argv[]) int dry_run = 0; int taskset_cpu = -1; uint64_t *ports_map = NULL; + struct peer *peer_list = NULL; int strict = 0; optind = 1; @@ -458,6 +483,16 @@ int main(int argc, char *argv[]) break; } + case 'P': { + const char **addresses = parse_argv(optarg, ','); + if (addresses[0] == NULL) { + FATAL("Warning peer list passed with -P was empty"); + } + peer_list = make_peerlist(addresses); + free(addresses); + break; + } + case 'v': verbose++; break; @@ -504,7 +539,13 @@ int main(int argc, char *argv[]) state.strict = strict; state.dry_run = dry_run; state.ports_map = ports_map; + state.peer_list = peer_list; state.raw_sd = setup_raw(iface); + state.raw4 = -1; + state.raw6 = -1; + if (peer_list != NULL) { + setup_rawipsocket(&state.raw4, &state.raw6); + } struct uevent uevent; uevent_new(&uevent); @@ -569,6 +610,7 @@ int main(int argc, char *argv[]) if (state.ports_map) { bitmap_free(state.ports_map); } + free_peerlist(state.peer_list); return 0; } diff --git a/src/net.c b/src/net.c index 8a87e3b..462c609 100644 --- a/src/net.c +++ b/src/net.c @@ -2,6 +2,7 @@ // // Copyright (c) 2015 CloudFlare, Inc. 
+#include #include #include #include @@ -12,11 +13,23 @@ #include #include #include +#include #include #include #include "pmtud.h" +#define MAX_PEERS 32 + +struct peer +{ + union { + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } sa; + socklen_t salen; +}; + pcap_t *setup_pcap(const char *iface, const char *bpf_filter, int snap_len, struct pcap_stat *stats) { @@ -179,3 +192,131 @@ const char *ip_to_string(const uint8_t *p, int p_len) } return dst; } + +/* Open the raw ICMP (IPv4) and ICMPv6 sockets used to resend packets to + * L3 peers. A socket() failure is reported via PFATAL(). */ +void setup_rawipsocket(int *raw4, int *raw6) +{ + + *raw4 = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP); + if (*raw4 < 0) { + PFATAL("socket(AF_INET, SOCK_RAW, IPPROTO_ICMP)"); + } + *raw6 = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6); + if (*raw6 < 0) { + PFATAL("socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6)"); + } +} + +/* Build the peer table from a NULL-terminated array of numeric addresses. + * The table has MAX_PEERS zero-filled slots; the first slot whose salen + * is 0 ends the list. Bad input is reported via FATAL(). */ +struct peer *make_peerlist(const char **addresses) +{ + struct addrinfo hints; + struct addrinfo *result; + struct peer *peer_list; + int r; + int i; + + peer_list = calloc(MAX_PEERS, sizeof(struct peer)); + if (peer_list == NULL) { + PFATAL("calloc(peer_list)"); + } + + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_flags = AI_NUMERICHOST; + for (i = 0; addresses[0] != NULL; addresses++, i++) { + if (i >= MAX_PEERS) { + FATAL("Maximum number of peers exceeded %d", + MAX_PEERS); + } + r = getaddrinfo(addresses[0], NULL, &hints, &result); + if (r != 0) { + FATAL("Malformed peer address %s", addresses[0]); + } + if (result->ai_addrlen > sizeof(peer_list[i].sa)) { + FATAL("Internal error in address structures"); + } + memcpy(&peer_list[i].sa, result->ai_addr, result->ai_addrlen); + peer_list[i].salen = result->ai_addrlen; + freeaddrinfo(result); + } + return peer_list; +} + +/* Release a table returned by make_peerlist(). */ +void free_peerlist(struct peer *peer_list) +{ + free(peer_list); +} + +/* Return 0 if the p_len-byte address at p (4 for IPv4, 16 for IPv6) + * matches a configured peer, -1 otherwise. */ +int check_peerlist(struct peer *peer_list, const uint8_t *p, int p_len) +{ + struct peer *peer; + + for (peer = peer_list; peer < peer_list + MAX_PEERS && peer->salen != 0; + peer++) { + if (p_len == 4 && peer->sa.sin.sin_family == AF_INET) { + if 
(memcmp(&peer->sa.sin.sin_addr, p, p_len) == 0) { + return 0; + } + } else if (p_len == 16 && peer->sa.sin6.sin6_family == AF_INET6) { + if (memcmp(&peer->sa.sin6.sin6_addr, p, p_len) == 0) { + return 0; + } + } + } + return -1; +} + +/* Resend an ICMP message to every peer of the matching address family, + * with the original TTL / hop limit decremented by one. Nothing is sent + * when the decremented value is not positive. */ +void sendto_peerlist(struct peer *peer_list, int raw4, int raw6, int addr_len, + const uint8_t *icmppkt, unsigned icmppkt_len, int orig_ttl) +{ + struct peer *peer; + int family; + int r; + int ttl; + + if (addr_len == 4) { + family = AF_INET; + } else if (addr_len == 16) { + family = AF_INET6; + } else { + FATAL("addr_len is neither 4 nor 16"); + } + + ttl = orig_ttl - 1; + /* <= 0 also covers a zero or unset (-1) orig_ttl; a hop limit of -1 + * would ask the kernel for its default instead of dropping */ + if (ttl <= 0) { + return; + } + if (family == AF_INET) { + r = setsockopt(raw4, IPPROTO_IP, IP_TTL, &ttl, sizeof(ttl)); + } else { + r = setsockopt(raw6, IPPROTO_IPV6, IPV6_UNICAST_HOPS, &ttl, + sizeof(ttl)); + } + if (r < 0) { + PFATAL("setsockopt()"); + } + for (peer = peer_list; peer < peer_list + MAX_PEERS && peer->salen != 0; + peer++) { + if (peer->sa.sin.sin_family != family) { + continue; + } + r = sendto((family == AF_INET ? raw4 : raw6), + icmppkt, icmppkt_len, 0, (struct sockaddr *)&peer->sa, + peer->salen); + /* ENOBUFS happens during IRQ storms okay to ignore */ + if (r < 0 && errno != ENOBUFS) { + PFATAL("sendto()"); + } + } +} diff --git a/src/pmtud.h b/src/pmtud.h index 40df40c..639f73c 100644 --- a/src/pmtud.h +++ b/src/pmtud.h @@ -43,11 +43,18 @@ int signal_desc(int signal); const char **parse_argv(const char *str, char delim); /* pcap.c */ +struct peer; pcap_t *setup_pcap(const char *iface, const char *bpf_filter, int snap_len, struct pcap_stat *stats); void unsetup_pcap(pcap_t *pcap, const char *iface, struct pcap_stat *stats); int setup_raw(const char *iface); const char *ip_to_string(const uint8_t *p, int p_len); +void setup_rawipsocket(int *raw4, int *raw6); +struct peer *make_peerlist(const char **addresses); +void free_peerlist(struct peer *peer_list); +int check_peerlist(struct peer *peer_list, const uint8_t *p, int p_len); +void sendto_peerlist(struct peer *peer_list, int raw4, int raw6, int addr_len, + const uint8_t *icmppkt, unsigned 
icmppkt_len, int orig_ttl); /* sched.c */ int taskset(int taskset_cpu);