dns: implement tcp fallback in __res_msend query core
[musl] / src / network / res_msend.c
1 #include <sys/socket.h>
2 #include <netinet/in.h>
3 #include <netinet/tcp.h>
4 #include <netdb.h>
5 #include <arpa/inet.h>
6 #include <stdint.h>
7 #include <string.h>
8 #include <poll.h>
9 #include <time.h>
10 #include <ctype.h>
11 #include <unistd.h>
12 #include <errno.h>
13 #include <pthread.h>
14 #include "stdio_impl.h"
15 #include "syscall.h"
16 #include "lookup.h"
17
/* Cleanup handler: close every open descriptor in a pollfd array.
 *
 * p points to an array of struct pollfd. Entries whose fd is -1 are
 * unused slots and are skipped; the first entry whose fd is below -1
 * marks the end of the array. */
static void cleanup(void *p)
{
	struct pollfd *slot;

	for (slot = p; slot->fd >= -1; slot++) {
		if (slot->fd < 0) continue;
		/* Raw syscall rather than close(). NOTE(review): presumably so
		 * that closing is not itself a cancellation point - confirm. */
		__syscall(SYS_close, slot->fd);
	}
}
24
/* Return a timestamp in milliseconds for measuring elapsed time.
 *
 * The monotonic clock is preferred so that stepping the wall clock
 * during a lookup cannot shorten or extend the query timeout. If the
 * kernel does not provide it (ENOSYS), fall back to the realtime clock.
 * The value is only meaningful as a difference between two calls; it
 * may wrap where unsigned long is 32 bits wide, which unsigned
 * subtraction by the caller tolerates. */
static unsigned long mtime(void)
{
	struct timespec ts;
	if (clock_gettime(CLOCK_MONOTONIC, &ts) < 0 && errno == ENOSYS)
		clock_gettime(CLOCK_REALTIME, &ts);
	return (unsigned long)ts.tv_sec * 1000
		+ ts.tv_nsec / 1000000;
}
32
/* Begin a TCP query to the nameserver at sa (length sl).
 *
 * A nonblocking stream socket of the given family is created and stored
 * in pfd->fd. If the socket accepts TCP_FASTOPEN_CONNECT, the query q
 * (ql bytes), preceded by its 2-byte big-endian length, is handed to
 * sendmsg with MSG_FASTOPEN so it can go out with the connection
 * attempt; otherwise a plain nonblocking connect is started.
 *
 * pfd->events is set to POLLIN when the whole framed query was already
 * sent, and POLLOUT otherwise.
 *
 * Returns the number of bytes of the framed query already sent (0 up to
 * ql+2), or -1 on failure, in which case the socket is closed and
 * pfd->fd is reset to -1. */
static int start_tcp(struct pollfd *pfd, int family, const void *sa, socklen_t sl, const unsigned char *q, int ql)
{
	uint8_t prefix[2] = { ql>>8, ql };
	struct iovec parts[2] = {
		{ .iov_base = prefix, .iov_len = 2 },
		{ .iov_base = (void *)q, .iov_len = ql },
	};
	struct msghdr msg = {
		.msg_name = (void *)sa,
		.msg_namelen = sl,
		.msg_iov = parts,
		.msg_iovlen = 2,
	};
	int enable = 1;
	int sock = socket(family, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);

	pfd->fd = sock;
	pfd->events = POLLOUT;

	if (setsockopt(sock, IPPROTO_TCP, TCP_FASTOPEN_CONNECT,
	    &enable, sizeof enable) == 0) {
		int sent = sendmsg(sock, &msg, MSG_FASTOPEN|MSG_NOSIGNAL);
		/* Everything went out already: only the reply is pending. */
		if (sent == ql+2) pfd->events = POLLIN;
		if (sent >= 0) return sent;
		/* Connection is under way; nothing has been sent yet. */
		if (errno == EINPROGRESS) return 0;
	}

	/* Fast open unavailable or failed: ordinary nonblocking connect. */
	if (connect(sock, sa, sl) == 0 || errno == EINPROGRESS)
		return 0;

	close(sock);
	pfd->fd = -1;
	return -1;
}
60
/* Advance the iovec array of mh past its first n bytes.
 *
 * Entries that n covers completely are dropped by moving msg_iov
 * forward and decrementing msg_iovlen; the first entry that is only
 * partly covered has its base and length adjusted in place. If n
 * reaches the end of the data, msg_iovlen ends up 0. */
static void step_mh(struct msghdr *mh, size_t n)
{
	while (mh->msg_iovlen) {
		struct iovec *head = mh->msg_iov;

		if (n < head->iov_len) {
			/* Partially consumed entry: trim its front and stop. */
			head->iov_base = (char *)head->iov_base + n;
			head->iov_len -= n;
			return;
		}

		/* Entry fully consumed: skip it. */
		n -= head->iov_len;
		mh->msg_iov = head + 1;
		mh->msg_iovlen--;
	}
}
73
74 /* Internal contract for __res_msend[_rc]: asize must be >=512, nqueries
75  * must be sufficiently small to be safe as VLA size. In practice it's
76  * either 1 or 2, anyway. */
77
78 int __res_msend_rc(int nqueries, const unsigned char *const *queries,
79         const int *qlens, unsigned char *const *answers, int *alens, int asize,
80         const struct resolvconf *conf)
81 {
82         int fd;
83         int timeout, attempts, retry_interval, servfail_retry;
84         union {
85                 struct sockaddr_in sin;
86                 struct sockaddr_in6 sin6;
87         } sa = {0}, ns[MAXNS] = {{0}};
88         socklen_t sl = sizeof sa.sin;
89         int nns = 0;
90         int family = AF_INET;
91         int rlen;
92         int next;
93         int i, j;
94         int cs;
95         struct pollfd pfd[nqueries+2];
96         int qpos[nqueries], apos[nqueries];
97         unsigned char alen_buf[nqueries][2];
98         int r;
99         unsigned long t0, t1, t2;
100
101         pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, &cs);
102
103         timeout = 1000*conf->timeout;
104         attempts = conf->attempts;
105
106         for (nns=0; nns<conf->nns; nns++) {
107                 const struct address *iplit = &conf->ns[nns];
108                 if (iplit->family == AF_INET) {
109                         memcpy(&ns[nns].sin.sin_addr, iplit->addr, 4);
110                         ns[nns].sin.sin_port = htons(53);
111                         ns[nns].sin.sin_family = AF_INET;
112                 } else {
113                         sl = sizeof sa.sin6;
114                         memcpy(&ns[nns].sin6.sin6_addr, iplit->addr, 16);
115                         ns[nns].sin6.sin6_port = htons(53);
116                         ns[nns].sin6.sin6_scope_id = iplit->scopeid;
117                         ns[nns].sin6.sin6_family = family = AF_INET6;
118                 }
119         }
120
121         /* Get local address and open/bind a socket */
122         fd = socket(family, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
123
124         /* Handle case where system lacks IPv6 support */
125         if (fd < 0 && family == AF_INET6 && errno == EAFNOSUPPORT) {
126                 for (i=0; i<nns && conf->ns[nns].family == AF_INET6; i++);
127                 if (i==nns) {
128                         pthread_setcancelstate(cs, 0);
129                         return -1;
130                 }
131                 fd = socket(AF_INET, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
132                 family = AF_INET;
133                 sl = sizeof sa.sin;
134         }
135         sa.sin.sin_family = family;
136         if (fd < 0 || bind(fd, (void *)&sa, sl) < 0) {
137                 if (fd >= 0) close(fd);
138                 pthread_setcancelstate(cs, 0);
139                 return -1;
140         }
141
142         /* Past this point, there are no errors. Each individual query will
143          * yield either no reply (indicated by zero length) or an answer
144          * packet which is up to the caller to interpret. */
145
146         for (i=0; i<nqueries; i++) pfd[i].fd = -1;
147         pfd[nqueries].fd = fd;
148         pfd[nqueries].events = POLLIN;
149         pfd[nqueries+1].fd = -2;
150
151         pthread_cleanup_push(cleanup, pfd);
152         pthread_setcancelstate(cs, 0);
153
154         /* Convert any IPv4 addresses in a mixed environment to v4-mapped */
155         if (family == AF_INET6) {
156                 setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &(int){0}, sizeof 0);
157                 for (i=0; i<nns; i++) {
158                         if (ns[i].sin.sin_family != AF_INET) continue;
159                         memcpy(ns[i].sin6.sin6_addr.s6_addr+12,
160                                 &ns[i].sin.sin_addr, 4);
161                         memcpy(ns[i].sin6.sin6_addr.s6_addr,
162                                 "\0\0\0\0\0\0\0\0\0\0\xff\xff", 12);
163                         ns[i].sin6.sin6_family = AF_INET6;
164                         ns[i].sin6.sin6_flowinfo = 0;
165                         ns[i].sin6.sin6_scope_id = 0;
166                 }
167         }
168
169         memset(alens, 0, sizeof *alens * nqueries);
170
171         retry_interval = timeout / attempts;
172         next = 0;
173         t0 = t2 = mtime();
174         t1 = t2 - retry_interval;
175
176         for (; t2-t0 < timeout; t2=mtime()) {
177                 /* This is the loop exit condition: that all queries
178                  * have an accepted answer. */
179                 for (i=0; i<nqueries && alens[i]>0; i++);
180                 if (i==nqueries) break;
181
182                 if (t2-t1 >= retry_interval) {
183                         /* Query all configured namservers in parallel */
184                         for (i=0; i<nqueries; i++)
185                                 if (!alens[i])
186                                         for (j=0; j<nns; j++)
187                                                 sendto(fd, queries[i],
188                                                         qlens[i], MSG_NOSIGNAL,
189                                                         (void *)&ns[j], sl);
190                         t1 = t2;
191                         servfail_retry = 2 * nqueries;
192                 }
193
194                 /* Wait for a response, or until time to retry */
195                 if (poll(pfd, nqueries+1, t1+retry_interval-t2) <= 0) continue;
196
197                 while (next < nqueries &&
198                   (rlen = recvfrom(fd, answers[next], asize, 0,
199                   (void *)&sa, (socklen_t[1]){sl})) >= 0) {
200
201                         /* Ignore non-identifiable packets */
202                         if (rlen < 4) continue;
203
204                         /* Ignore replies from addresses we didn't send to */
205                         for (j=0; j<nns && memcmp(ns+j, &sa, sl); j++);
206                         if (j==nns) continue;
207
208                         /* Find which query this answer goes with, if any */
209                         for (i=next; i<nqueries && (
210                                 answers[next][0] != queries[i][0] ||
211                                 answers[next][1] != queries[i][1] ); i++);
212                         if (i==nqueries) continue;
213                         if (alens[i]) continue;
214
215                         /* Only accept positive or negative responses;
216                          * retry immediately on server failure, and ignore
217                          * all other codes such as refusal. */
218                         switch (answers[next][3] & 15) {
219                         case 0:
220                         case 3:
221                                 break;
222                         case 2:
223                                 if (servfail_retry && servfail_retry--)
224                                         sendto(fd, queries[i],
225                                                 qlens[i], MSG_NOSIGNAL,
226                                                 (void *)&ns[j], sl);
227                         default:
228                                 continue;
229                         }
230
231                         /* Store answer in the right slot, or update next
232                          * available temp slot if it's already in place. */
233                         alens[i] = rlen;
234                         if (i == next)
235                                 for (; next<nqueries && alens[next]; next++);
236                         else
237                                 memcpy(answers[i], answers[next], rlen);
238
239                         /* Ignore further UDP if all slots full or TCP-mode */
240                         if (next == nqueries) pfd[nqueries].events = 0;
241
242                         /* If answer is truncated (TC bit), fallback to TCP */
243                         if (answers[i][2] & 2) {
244                                 alens[i] = -1;
245                                 pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, 0);
246                                 r = start_tcp(pfd+i, family, ns+j, sl, queries[i], qlens[i]);
247                                 pthread_setcancelstate(cs, 0);
248                                 if (r >= 0) {
249                                         qpos[i] = r;
250                                         apos[i] = 0;
251                                 }
252                                 continue;
253                         }
254                 }
255
256                 for (i=0; i<nqueries; i++) if (pfd[i].revents & POLLOUT) {
257                         struct msghdr mh = {
258                                 .msg_iovlen = 2,
259                                 .msg_iov = (struct iovec [2]){
260                                         { .iov_base = (uint8_t[]){ qlens[i]>>8, qlens[i] }, .iov_len = 2 },
261                                         { .iov_base = (void *)queries[i], .iov_len = qlens[i] } }
262                         };
263                         step_mh(&mh, qpos[i]);
264                         r = sendmsg(pfd[i].fd, &mh, MSG_NOSIGNAL);
265                         if (r < 0) goto out;
266                         qpos[i] += r;
267                         if (qpos[i] == qlens[i]+2)
268                                 pfd[i].events = POLLIN;
269                 }
270
271                 for (i=0; i<nqueries; i++) if (pfd[i].revents & POLLIN) {
272                         struct msghdr mh = {
273                                 .msg_iovlen = 2,
274                                 .msg_iov = (struct iovec [2]){
275                                         { .iov_base = alen_buf[i], .iov_len = 2 },
276                                         { .iov_base = answers[i], .iov_len = asize } }
277                         };
278                         step_mh(&mh, apos[i]);
279                         r = recvmsg(pfd[i].fd, &mh, 0);
280                         if (r < 0) goto out;
281                         apos[i] += r;
282                         if (apos[i] < 2) continue;
283                         int alen = alen_buf[i][0]*256 + alen_buf[i][1];
284                         if (alen < 13) goto out;
285                         if (apos[i] < alen+2 && apos[i] < asize+2)
286                                 continue;
287                         int rcode = answers[i][3] & 15;
288                         if (rcode != 0 && rcode != 3)
289                                 goto out;
290
291                         /* Storing the length here commits the accepted answer.
292                          * Immediately close TCP socket so as not to consume
293                          * resources we no longer need. */
294                         alens[i] = alen;
295                         __syscall(SYS_close, pfd[i].fd);
296                         pfd[i].fd = -1;
297                 }
298         }
299 out:
300         pthread_cleanup_pop(1);
301
302         /* Disregard any incomplete TCP results */
303         for (i=0; i<nqueries; i++) if (alens[i]<0) alens[i] = 0;
304
305         return 0;
306 }
307
308 int __res_msend(int nqueries, const unsigned char *const *queries,
309         const int *qlens, unsigned char *const *answers, int *alens, int asize)
310 {
311         struct resolvconf conf;
312         if (__get_resolv_conf(&conf, 0, 0) < 0) return -1;
313         return __res_msend_rc(nqueries, queries, qlens, answers, alens, asize, &conf);
314 }