From 522e9778eaf37c99ef15cdebdeee58cedfa5af5c Mon Sep 17 00:00:00 2001 From: Kim Alvefur Date: Mon, 24 Feb 2020 20:21:46 +0100 Subject: net.dns: Handle being loaded outside of Prosody `if timer ...` suggests that this was intended, but it did not work because net.timer depends on net.server which refuses to be loaded outside of Prosody. --- net/dns.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/dns.lua') diff --git a/net/dns.lua b/net/dns.lua index 3902f95c..193067e3 100644 --- a/net/dns.lua +++ b/net/dns.lua @@ -13,7 +13,7 @@ local socket = require "socket"; -local timer = require "util.timer"; +local have_timer, timer = pcall(require, "util.timer"); local new_ip = require "util.ip".new_ip; local have_util_net, util_net = pcall(require, "util.net"); @@ -871,7 +871,7 @@ function resolver:query(qname, qtype, qclass) -- - - - - - - - - - -- query set(self.wanted, qclass, qtype, qname, co, true); end - if timer and self.timeout then + if have_timer and self.timeout then local num_servers = #self.server; local i = 1; timer.add_task(self.timeout, function () -- cgit v1.2.3 From 5f4fcad1127da29e34759b5b54b6bbd3a42da5aa Mon Sep 17 00:00:00 2001 From: Matthew Wild Date: Thu, 25 Jun 2020 15:22:40 +0100 Subject: net.dns: Add some debug logging --- net/dns.lua | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/dns.lua') diff --git a/net/dns.lua b/net/dns.lua index 193067e3..67eaa647 100644 --- a/net/dns.lua +++ b/net/dns.lua @@ -17,6 +17,8 @@ local have_timer, timer = pcall(require, "util.timer"); local new_ip = require "util.ip".new_ip; local have_util_net, util_net = pcall(require, "util.net"); +local log = require "util.logger".init("dns"); + local _, windows = pcall(require, "util.windows"); local is_windows = (_ and windows) or os.getenv("WINDIR"); @@ -877,6 +879,7 @@ function resolver:query(qname, qtype, qclass) -- - - - - - - - - - -- query timer.add_task(self.timeout, function () if get(self.wanted, qclass, qtype, qname, co) then if i < num_servers then + log("debug", "DNS request timeout %d/%d", i, num_servers) i = i + 1; self:servfail(conn); o.server = self.best_server; @@ -904,6 +907,7 @@ function resolver:servfail(sock, err) -- Find all requests to the down server, and retry on the next server self.time = socket.gettime(); + log("debug", "servfail %d (of %d)", num, #self.server); for id,queries in pairs(self.active) do for question,o in pairs(queries) do if o.server == num then -- This request was to the broken server -- cgit v1.2.3 From 3ff48b8386fdc9faf6e7f9d99a93858ec3a49728 Mon Sep 17 00:00:00 2001 From: Matthew Wild Date: Thu, 25 Jun 2020 15:28:23 +0100 Subject: net.dns: Fix timeout retry logic On timeout the query would be resent twice - once within servfail(), and again inside the timeout callback. This commit moves all retry logic to servfail(). --- net/dns.lua | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'net/dns.lua') diff --git a/net/dns.lua b/net/dns.lua index 67eaa647..1dcb0479 100644 --- a/net/dns.lua +++ b/net/dns.lua @@ -856,6 +856,9 @@ function resolver:query(qname, qtype, qclass) -- - - - - - - - - - -- query server = self.best_server, delay = 1, retry = socket.gettime() + self.delays[1] + qclass = qclass; + qtype = qtype; + qname = qname; }; -- remember the query @@ -878,19 +881,14 @@ function resolver:query(qname, qtype, qclass) -- - - - - - - - - - -- query local i = 1; timer.add_task(self.timeout, function () if get(self.wanted, qclass, qtype, qname, co) then - if i < num_servers then log("debug", "DNS request timeout %d/%d", i, num_servers) i = i + 1; - self:servfail(conn); - o.server = self.best_server; - conn, err = self:getsocket(o.server); - if conn then - conn:send(o.packet); - return self.timeout; - end - end - -- Tried everything, failed - self:cancel(qclass, qtype, qname); + self:servfail(self.socket[o.server]); +-- end + end + -- Still outstanding? (i.e. retried) + if get(self.wanted, qclass, qtype, qname, co) then + return self.timeout; -- Then wait end end) end @@ -917,12 +915,19 @@ function resolver:servfail(sock, err) end o.retries = (o.retries or 0) + 1; - if o.retries >= #self.server then - --print('timeout'); - queries[question] = nil; - else + local retried; + if o.retries < #self.server then sock, err = self:getsocket(o.server); - if sock then sock:send(o.packet); end + if sock then + retried = true; + log("debug", "retry %d (immediate)", o.retries); + sock:send(o.packet); + end + end + if not retried then + log("debug", 'tried all servers, giving up'); + self:cancel(o.qclass, o.qtype, o.qname); + queries[question] = nil; end end end -- cgit v1.2.3 From d080fee3235102dd6a20cdfcd53f1ee080a1266b Mon Sep 17 00:00:00 2001 From: Matthew Wild Date: Thu, 25 Jun 2020 15:29:49 +0100 Subject: net.dns: Add jitter to spread queries and reduce failures due to congestion --- net/dns.lua | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'net/dns.lua') diff --git a/net/dns.lua b/net/dns.lua index 1dcb0479..6f5f28d4 100644 --- a/net/dns.lua +++ b/net/dns.lua @@ -72,6 +72,8 @@ local ztact = { -- public domain 20080404 lua@ztact.com local get, set = ztact.get, ztact.set; local default_timeout = 15; +local default_jitter = 1; +local default_retry_jitter = 2; -------------------------------------------------- module dns local _ENV = nil; @@ -668,6 +670,8 @@ end resolver.delays = { 1, 3 }; +resolver.jitter = have_timer and default_jitter or nil; +resolver.retry_jitter = have_timer and default_retry_jitter or nil; function resolver:addnameserver(address) -- - - - - - - - - - addnameserver self.server = self.server or {}; @@ -855,7 +859,7 @@ function resolver:query(qname, qtype, qclass) -- - - - - - - - - - -- query packet = header..question, server = self.best_server, delay = 1, - retry = socket.gettime() + self.delays[1] + retry = socket.gettime() + self.delays[1]; qclass = qclass; qtype = qtype; qname = qname; @@ -869,7 +873,13 @@ function resolver:query(qname, qtype, qclass) -- - - - - - - - - - -- query if not conn then return nil, err; end - conn:send (o.packet) + if self.jitter then + timer.add_task(math.random()*self.jitter, function () + conn:send(o.packet); + end); + else + conn:send(o.packet); + end -- remember which coroutine wants the answer if co then @@ -920,8 +930,16 @@ function resolver:servfail(sock, err) sock, err = self:getsocket(o.server); if sock then retried = true; + if self.retry_jitter then + local delay = self.delays[((o.retries-1)%#self.delays)+1] + (math.random()*self.retry_jitter); + log("debug", "retry %d in %0.2fs", o.retries, delay); + timer.add_task(delay, function () + sock:send(o.packet); + end); + else log("debug", "retry %d (immediate)", o.retries); sock:send(o.packet); + end end end if not retried then -- cgit v1.2.3 From 5e744740f305fdb82046c12c9f60cecc28226faf Mon Sep 17 00:00:00 2001 From: Matthew Wild Date: Thu, 25 Jun 2020 15:31:20 +0100 Subject: net.dns: Increase backoff delays Not entirely happy with the overall logic here. --- net/dns.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/dns.lua') diff --git a/net/dns.lua b/net/dns.lua index 6f5f28d4..ae3f947c 100644 --- a/net/dns.lua +++ b/net/dns.lua @@ -668,7 +668,7 @@ end -- socket layer -------------------------------------------------- socket layer -resolver.delays = { 1, 3 }; +resolver.delays = { 1, 2, 3, 5 }; resolver.jitter = have_timer and default_jitter or nil; resolver.retry_jitter = have_timer and default_retry_jitter or nil; -- cgit v1.2.3 From 6daae1f629f669872221186e7d2f9124fd66873f Mon Sep 17 00:00:00 2001 From: Matthew Wild Date: Thu, 25 Jun 2020 15:34:29 +0100 Subject: net.dns: Reduce default timeout to 5s Most healthy queries will return well within this time, and the new retry logic should help spread the cost of additional retries. --- net/dns.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/dns.lua') diff --git a/net/dns.lua b/net/dns.lua index ae3f947c..18cf51d6 100644 --- a/net/dns.lua +++ b/net/dns.lua @@ -71,7 +71,7 @@ local ztact = { -- public domain 20080404 lua@ztact.com }; local get, set = ztact.get, ztact.set; -local default_timeout = 15; +local default_timeout = 5; local default_jitter = 1; local default_retry_jitter = 2; -- cgit v1.2.3 From fb5059547f3171087410344fcf0bffcb8f5f1433 Mon Sep 17 00:00:00 2001 From: Matthew Wild Date: Sun, 28 Jun 2020 12:02:10 +0100 Subject: net.dns: Disable jitter for default resolver (used by blocking dns.lookup() calls) This fixes 'prosodyctl check dns' being slow. --- net/dns.lua | 1 + 1 file changed, 1 insertion(+) (limited to 'net/dns.lua') diff --git a/net/dns.lua b/net/dns.lua index 18cf51d6..17119152 100644 --- a/net/dns.lua +++ b/net/dns.lua @@ -1191,6 +1191,7 @@ end local _resolver = dns.resolver(); dns._resolver = _resolver; +_resolver.jitter, _resolver.retry_jitter = false, false; function dns.lookup(...) -- - - - - - - - - - - - - - - - - - - - - lookup return _resolver:lookup(...); -- cgit v1.2.3