diff --git a/avr/cores/MCUdude_corefiles/Arduino.h b/avr/cores/MCUdude_corefiles/Arduino.h
index 78eeb8a2..fe006267 100755
--- a/avr/cores/MCUdude_corefiles/Arduino.h
+++ b/avr/cores/MCUdude_corefiles/Arduino.h
@@ -121,11 +121,11 @@ void yield(void);
 #undef abs
 #endif
 
-#define abs(x)       __builtin_abs(x)
+#define abs(x)       ({ typeof (x) _x = (x); _x > 0 ? _x : -_x; }) // x is evaluated exactly once
 #define sq(x)        ({ typeof (x) _x = (x); _x * _x; })
-#define min(a,b)     ({ typeof (a) _a = (a); typeof (b) _b = (b); _a < _b ? _a : _b;    })
-#define max(a,b)     ({ typeof (a) _a = (a); typeof (b) _b = (b); _a > _b ? _a : _b;    })
-#define round(x)     ({ typeof (x) _x = (x); _x >= 0 ? (long)_x + 0.5 : (long)_x - 0.5; })
+#define min(a,b)     ({ typeof (a) _a = (a); typeof (b) _b = (b); _a < _b ? _a : _b; })
+#define max(a,b)     ({ typeof (a) _a = (a); typeof (b) _b = (b); _a > _b ? _a : _b; })
+#define round(x)     ({ typeof (x) _x = (x); _x >= 0 ? (long)(_x + 0.5) : (long)(_x - 0.5); })
 #define radians(deg) ((deg) * DEG_TO_RAD)
 #define degrees(rad) ((rad) * RAD_TO_DEG)
 #define constrain(x,low,high)     ({ \
diff --git a/avr/cores/MCUdude_corefiles/README.md b/avr/cores/MCUdude_corefiles/README.md
index 71e81674..8c68f6da 100644
--- a/avr/cores/MCUdude_corefiles/README.md
+++ b/avr/cores/MCUdude_corefiles/README.md
@@ -3,7 +3,9 @@
 
 This repo contains the Arduino corefiles used with [MightyCore](https://github.com/MCUdude/MightyCore), [MegaCore](https://github.com/MCUdude/MegaCore), [MiniCore](https://github.com/MCUdude/MiniCore) and [MajorCore](https://github.com/MCUdude/MightyCore).
 
+
 ## Supported devices
+
 * ATmega640, ATmega1280, ATmega2560
 * ATmega64, ATmega128, ATmega1281, ATmega2561
 * AT90CAN32, AT90CAN64, AT90CAN128
@@ -11,20 +13,92 @@ This repo contains the Arduino corefiles used with [MightyCore](https://github.c
 * ATmega8515, ATmega162
 * ATmega8, ATmega48/P/PA/PB, ATmega88/P/PA/PB, ATmega168/P/PA/PB, ATmega328/P/PA/PB
 
+
 ## Supported clock frequencies
-By supported I mean clocks that accurate timing is implemented for (millis, micros, delay, delayMicroseconds).
+
+"Supported" means that accurate timing is implemented for that clock (millis,
+micros, delay, delayMicroseconds).
+
 * 32 MHz
+* 25 MHz
 * 24 MHz
+* 22.1184 MHz
 * 20 MHz
 * 18.432 MHz
+* 18 MHz
+* 16.5 MHz
 * 16 MHz
 * 14.7456 MHz
 * 12 MHz
 * 11.0592 MHz
+* 10 MHz
+* 9.216 MHz
 * 8 MHz
 * 7.3728 MHz
+* 6 MHz
 * 4 MHz
 * 3.6864 MHz
 * 2 MHz
 * 1.8432 MHz
 * 1 MHz
+
+
+### Adding further clock frequencies
+
+The calculation of `millis()`, `micros()` and `delay()` is automatic for
+arbitrary frequencies.
+Depending on the prime factors of the frequency, it is either exact or
+approximate to 60 ppm accuracy (worst-case).
+The only thing required is adding support in `delayMicroseconds()`.
+
+
+### Exactness of `delayMicroseconds()`
+
+The `delayMicroseconds(unsigned int us)` implementation is exact up to a few
+cycles for the frequencies listed above.
+
+The largest argument that works reliably is 10000, i.e. 10 milliseconds.
+Interrupts occurring during the call may prolong the delay.
+
+
+### Exactness of `micros()` and `delay()`
+
+For the clock speeds listed above, `micros()` is corrected to zero drift.
+Even for very long run times, the `micros()` function will precisely follow the
+oscillator used.
+
+Frequencies not listed above are either exact or corrected to below 60 ppm drift
+and in exact sync with `millis()`.
+
+Note that the result of `micros()` may jump up by several microseconds between
+consecutive calls and rolls over after one hour and eleven minutes.
+
+The `delay()` function uses `micros()` internally and inherits its drift accuracy
+with slight variations due to function call overhead and processing.
+It is immune to interrupts and thus long-term accurate.
+
+
+### Exactness of `millis()`
+
+For the clock speeds listed above, `millis()` is corrected to zero drift.
+Even for very long run times, the `millis()` function will precisely follow the
+oscillator used.
+
+Frequencies not listed above are either exact or corrected to below 60 ppm drift
+and in exact sync with `micros()` and `delay()`.
+
+We do not track the rollover of the `unsigned long` millis counter that
+occurs every 49.7 days; that would have to be done in the user's program.
+Often this is not necessary: the code
+
+    if (millis() - millis_old >= interval) {
+      /* do something */
+      millis_old += interval;
+    }
+
+is long-term accurate even when rolling over provided `millis_old` is of type
+`unsigned long`.
+
+For clock speeds of 16 MHz and below, the return value of `millis()`
+occasionally jumps up by more than one (notwithstanding low/zero drift).
+Thus, when relying on consecutive returns, run at 16.5 MHz or higher.
diff --git a/avr/cores/MCUdude_corefiles/wiring.c b/avr/cores/MCUdude_corefiles/wiring.c
index 6b0df7f7..9d2bf2ec 100755
--- a/avr/cores/MCUdude_corefiles/wiring.c
+++ b/avr/cores/MCUdude_corefiles/wiring.c
@@ -24,29 +24,131 @@
 
 // the prescaler is set so that timer0 ticks every 64 clock cycles, and the
 // the overflow handler is called every 256 ticks.
-// 24MHz: An overflow happens every 682.67 microseconds ---> 0.04167, so this results in 682 
+// 24MHz: An overflow happens every 682.67 microseconds ---> 0.04167, so this results in 682
 // 20MHz: An overflow happens every 819.2 microseconds ---> 0,05 (time of a cycle in micros) * 64 (timer0 tick) * 256 (every 256 ticks timer0 overflows), so this results in 819
 // 16MHz: An overflow happens every 1024 microseconds
+#if 0
+// this would be inaccurate for non-power-of-two frequencies
 #define MICROSECONDS_PER_TIMER0_OVERFLOW (clockCyclesToMicroseconds(64 * 256))
+#else
+// It is vital to avoid unnecessary roundoff in this calculation.
+// What we really want to compute is the number of microseconds in one
+// timer cycle, thus 64 * 256 * 1e6 / F_CPU.  When calculating with integers,
+// the product 64 * 256 * 1000**2 overflows an unsigned long.  We resolve this
+// by recognizing that F_CPU is evenly divisible by 10 in all cases.  Thus, we
+// cancel a factor of 10 on both sides, which allows us to use unsigned long.
+// It turns out that code runs faster when the number is explicitly unsigned!
+#define MICROSECONDS_PER_TIMER0_OVERFLOW \
+  (64UL * 256UL * 100000UL / ((F_CPU + 5UL) / 10UL))
+#endif
 
 // the whole number of milliseconds per timer0 overflow
 // For 20MHz this would be 0 (because of 819)
 // For 16MHz this would be 1 (because of 1024)
-#define MILLIS_INC (MICROSECONDS_PER_TIMER0_OVERFLOW / 1000)
+#define MILLIS_INC (MICROSECONDS_PER_TIMER0_OVERFLOW / 1000U)
 
 // the fractional number of milliseconds per timer0 overflow. we shift right
 // by three to fit these numbers into a byte. (for the clock speeds we care
 // about - 8 and 16 MHz - this doesn't lose precision.)
 // For 16 MHz: 24 (1024 % 1000) gets shifted right by 3 which results in 3   (precision was lost)
 // For 20 MHz: 819 (819 % 1000) gets shifted right by 3 which results in 102 (precision was lost)
-// For 24 MHz: 682 (682 % 1000) gets shifted right by 3 which results in 
-#define FRACT_INC ((MICROSECONDS_PER_TIMER0_OVERFLOW % 1000) >> 3)
+// For 24 MHz: 682 (682 % 1000) gets shifted right by 3 which results in
+#define FRACT_INC ((MICROSECONDS_PER_TIMER0_OVERFLOW % 1000U) >> 3)
 // Shift right by 3 to fit in a byte (results in 125)
-#define FRACT_MAX (1000 >> 3)
+#define FRACT_MAX (1000U >> 3)
 
-volatile unsigned long timer0_overflow_count = 0;
 volatile unsigned long timer0_millis = 0;
-static unsigned char timer0_fract = 0;
+volatile unsigned char timer0_fract = 0;
+
+// Add a correction calculation to make millis () exact for most clocks.
+// The idea is to compare the exact microseconds/8 between overflows,
+// namely (1. / F_CPU * 64. * 256. * 1e6) % 1000 / 8.,
+// with the integer rounded down version in FRACT_INC.
+// For the clock speeds examined below, we encounter four different cases.
+// The low case: FRACT_INC is too low by a fraction 1 / n.
+//               Correct by adding 1 to the fract counter every n times.
+// The high case: FRACT_INC is too low by a fraction (n - 1) / n.
+//               Add 1 to the fract counter always except every n times.
+// A special case for 20 MHz: FRACT_INC is too low by the fraction 2. / 5.
+//               Correct by adding 2 out of 5 times: every odd number in 0..4.
+// A special case for 11.0592 MHz: FRACT_INC is too low by 5. / 27.
+//               Correct brute force by counting 5 out of 27.
+//               Do it the same way for the remaining odd cases.
+// This way we correct losses from both the rounding to usecs and the shift.
+// For the remaining non-exact cases, we use a highly accurate approximation.
+// This happens to be exact, too, for leftover UART-related frequencies.
+#define FRACT_INC_PLUS
+#define EXACT_NUM (64UL * 256UL * 125UL * 100UL)
+#define EXACT_DEN ((F_CPU + 5UL) / 10UL)
+#define EXACT_REM (EXACT_NUM - (EXACT_NUM / EXACT_DEN) * EXACT_DEN)
+#if EXACT_REM > 0 || MICROSECONDS_PER_TIMER0_OVERFLOW % 256 > 0 // correct
+#define CORRECT_EXACT_MILLIS
+#define CORRECT_EXACT_MICROS
+#if F_CPU == 25000000L          // for 25 MHz we get 81.92, off by 23./25.
+#define CORRECT_BRUTE 23
+#define CORRECT_ROLL 25
+#elif F_CPU == 24000000L        // for 24 MHz we get 85.33, off by 1./3.
+#define CORRECT_LO
+#define CORRECT_ROLL 3
+#elif F_CPU == 22118400L        // for 22.1184 MHz we get 92 + 16./27.
+#define CORRECT_BRUTE 16
+#define CORRECT_ROLL 27
+#elif F_CPU == 20000000L        // for 20 MHz we get 102.4, off by 2./5.
+#define CORRECT_ODD
+#define CORRECT_ROLL 5
+#elif F_CPU == 18432000L        // for 18.432 MHz we get 111.11, off by 1./9.
+#define CORRECT_LO
+#define CORRECT_ROLL 9
+#elif F_CPU == 18000000L        // for 18 MHz we get 113.78, off by 7./9.
+#define CORRECT_BRUTE 7
+#define CORRECT_ROLL 9
+#elif F_CPU == 16500000L        // for 16.5 MHz we get 124 + 4./33.
+#define CORRECT_BRUTE 4
+#define CORRECT_ROLL 33
+#elif F_CPU == 14745600L        // for 14.7456 MHz we get 13.89, off by 8./9.
+#define CORRECT_HI
+#define CORRECT_ROLL 9
+#elif F_CPU == 12000000L        // for 12 MHz we get 45.67, off by 2./3.
+#define CORRECT_HI
+#define CORRECT_ROLL 3
+#elif F_CPU == 11059200L        // for 11.0592 MHz we get 60 + 5./27.
+#define CORRECT_BRUTE 5
+#define CORRECT_ROLL 27
+#elif F_CPU == 10000000L        // for 10 MHz we get 79.8, off by 4./5.
+#define CORRECT_HI
+#define CORRECT_ROLL 5
+#elif F_CPU == 9216000L         // for 9.216 MHz we get 97. + 2./9.
+#define CORRECT_BRUTE 2
+#define CORRECT_ROLL 9
+#elif F_CPU == 7372800L         // for 7.3728 MHz we get 27 + 7./9.
+#define CORRECT_BRUTE 7
+#define CORRECT_ROLL 9
+#elif F_CPU == 6000000L         // for 6 MHz we get 91 + 1./3.
+#define CORRECT_LO
+#define CORRECT_ROLL 3
+#elif F_CPU == 3686400L         // for 3.6864 MHz we get 55 + 5./9.
+#define CORRECT_BRUTE 5
+#define CORRECT_ROLL 9
+#elif F_CPU == 1843200L         // for 1.8432 MHz we get 111.11, off by 1./9.
+#define CORRECT_LO
+#define CORRECT_ROLL 9
+#else                           // fallback accurate to better than 60 ppm
+#define CORRECT_BRUTE ((2U * 135U * EXACT_REM + EXACT_DEN) / (2U * EXACT_DEN))
+#define CORRECT_ROLL 135
+#if CORRECT_BRUTE <= 0
+#undef CORRECT_EXACT_MILLIS     // low corner case amounts to nothing
+#elif CORRECT_BRUTE >= CORRECT_ROLL
+#undef CORRECT_EXACT_MILLIS
+#undef FRACT_INC_PLUS
+#define FRACT_INC_PLUS + 1      // high corner case always adds one extra
+#endif
+#endif // fallback
+#endif // EXACT_REM > 0
+
+#ifndef CORRECT_EXACT_MICROS
+// variable is only needed in micros() calculation without exactness correction
+volatile unsigned long timer0_overflow_count = 0;
+#endif
 
 // timer0 interrupt routine ,- is called every time timer0 overflows
 #if defined(__AVR_ATtiny24__) || defined(__AVR_ATtiny44__) || defined(__AVR_ATtiny84__)
@@ -55,21 +157,59 @@ ISR(TIM0_OVF_vect)
 ISR(TIMER0_OVF_vect)
 #endif
 {
+#ifdef CORRECT_EXACT_MILLIS
+  // this is a variable that retains its value between calls
+  static unsigned char timer0_exact = 0;
+#endif
+
   // copy these to local variables so they can be stored in registers
   // (volatile variables must be read from memory on every access, so this saves time)
   unsigned long m = timer0_millis;
   unsigned char f = timer0_fract;
 
-  m += MILLIS_INC;
-  f += FRACT_INC;
+  f += FRACT_INC FRACT_INC_PLUS;
+
+#ifdef CORRECT_EXACT_MILLIS
+  // correct millis () to be exact for certain clocks
+  if (timer0_exact == CORRECT_ROLL - 1) {
+    timer0_exact = 0;
+#ifdef CORRECT_LO
+    ++f;
+#endif
+  }
+  else {
+    ++timer0_exact;
+#ifdef CORRECT_HI
+    ++f;
+#endif
+  }
+  // it does not matter for the long-time drift whether the following two
+  // corrections take place before or after the increment of timer0_exact
+#ifdef CORRECT_ODD
+  if (timer0_exact & 1) {
+    ++f;
+  }
+#endif
+#ifdef CORRECT_BRUTE
+  if (timer0_exact < CORRECT_BRUTE) {
+    ++f;
+  }
+#endif
+#endif // CORRECT_EXACT_MILLIS
+
   if (f >= FRACT_MAX) {
     f -= FRACT_MAX;
-    m += 1;
+    m += MILLIS_INC + 1;
+  }
+  else {
+    m += MILLIS_INC;
   }
 
   timer0_fract = f;
   timer0_millis = m;
+#ifndef CORRECT_EXACT_MICROS
   timer0_overflow_count++;
+#endif
 }
 
 unsigned long millis()
@@ -88,13 +228,24 @@ unsigned long millis()
 
 unsigned long micros() {
   unsigned long m;
-  uint8_t oldSREG = SREG;
+#ifdef CORRECT_EXACT_MICROS
+  unsigned char f; // temporary storage for millis fraction counter
+  unsigned char q = 0; // record whether an overflow is flagged
+#endif
   // t will be the number where the timer0 counter stopped
   uint8_t t;
+  uint8_t oldSREG = SREG;
 
   // Stop all interrupts
   cli();
+
+#ifdef CORRECT_EXACT_MICROS
+  // combine exact millisec and 8usec counters
+  m = timer0_millis;
+  f = timer0_fract;
+#else
   m = timer0_overflow_count;
+#endif
 
   // TCNT0 : The Timer Counter Register
 #if defined(TCNT0)
@@ -108,77 +259,148 @@ unsigned long micros() {
   // Timer0 Interrupt Flag Register
 #ifdef TIFR0
   if ((TIFR0 & _BV(TOV0)) && (t < 255))
+#ifndef CORRECT_EXACT_MICROS
     m++;
+#else
+    q = 1;
+#endif
 #else
   if ((TIFR & _BV(TOV0)) && (t < 255))
+#ifndef CORRECT_EXACT_MICROS
     m++;
+#else
+    q = 1;
+#endif
 #endif
   // Restore SREG
   SREG = oldSREG;
 
-#if F_CPU >= 24000000L && F_CPU < 32000000L
+#ifdef CORRECT_EXACT_MICROS
+  /* We convert milliseconds, fractional part and timer value
+     into a microsecond value.  Relies on CORRECT_EXACT_MILLIS.
+     Basically we multiply by 1000 and add the scaled timer.
+
+     The leading part by m and f is long-term accurate.
+     For the timer we just need to be close from below.
+     Must never be too high, or micros jumps backwards. */
+  m = (((m << 7) - (m << 1) - m + f) << 3) +
+      ((t * MICROSECONDS_PER_TIMER0_OVERFLOW) >> 8);
+  return q ? m + MICROSECONDS_PER_TIMER0_OVERFLOW : m;
+#elif 1
+  /* All power-of-two Megahertz frequencies enter here, as well as 12.8 MHz.
+     We only end up here if right shift before multiplication is exact. */
+  return ((m << 8) + t) * (MICROSECONDS_PER_TIMER0_OVERFLOW >> 8);
+#else
+/*
+ * This is the old code requiring individual treatment for each frequency.
+ * It has the following accuracy for non-power-of-two MHz frequencies.
+ *
+ * 20 MHz has a drift of 1 in 65536 (~15 ppm)
+ * 18.432 MHz has a drift of 1 in 64000 (~16 ppm)
+ * 25 MHz      has a drift of 1 in 43691 (~23 ppm)
+ * 14.7456 MHz has a drift of 1 in 10000 (100 ppm)
+ *  7.3728 MHz has a drift of 1 in 10000
+ *  3.6864 MHz has a drift of 1 in 10000
+ *  1.8432 MHz has a drift of 1 in 10000
+ * 24 MHz has a drift of 1 in 4096 (244 ppm)
+ * 18 MHz has a drift of 1 in 4096
+ * 12 MHz has a drift of 1 in 4096
+ * 22.1184 MHz has a drift of 1 in 2857 (350ppm)
+ * 11.0592 MHz has a drift of 1 in 2857
+*/
+#if F_CPU >= 32000000L
+  // we need to put this #if here to avoid entering the wrong branch for 32 MHz
+  return ((m << 8) + t) * (64 / clockCyclesPerMicrosecond());
+#elif F_CPU >= 25000000L
+  // m needs to be multiplied by 655.36
+  // and t by 2.56 ~ 5243 / 2048. for an error of 1 in 43691 (23 ppm)
+  m = (m << 8) + t;
+  // How many shift adds does it take until long multiply becomes faster?
+  // Can we just return (m * 41943UL) >> 14 and be done to 1ppm accuracy.
+  return (m << 2) - m - (m >> 1) + (m >> 4) - (m >> 9) - (m >> 11);
+#elif F_CPU >= 24000000L
   // m needs to be multiplied by 682.67
-  // and t by 2.67
+  // and t by 2.667 ~ 1365 / 512. for an error of 1 in 4096 (244 ppm)
+  m = (m << 8) + t;
+  m = (m << 1) + (m >> 1) + (m >> 3);
+  return m + (m >> 6);
+#elif F_CPU >= 22118400L
+  // m needs to be multiplied by 740.74
+  // and t by 2.894 ~ 741 / 256. for an error of 1 in 2857 (350 ppm)
   m = (m << 8) + t;
-  return (m << 1) + (m >> 1) + (m >> 3) + (m >> 4); // Multiply by 2.6875
+  return m + (m << 1) - (m >> 3) + (m >> 6) + (m >> 8);
 #elif F_CPU >= 20000000L
-  // m needs to be multiplied by 819.2 
-  // t needs to be multiplied by 3.2
+  // m needs to be multiplied by 819.2
+  // and t by 16. / 5. = 3.2 ~ 819 / 256. for an error of 1 in 4096
   m = (m << 8) + t;
-  return m + (m << 1) + (m >> 2) - (m >> 4); // Multiply by 3.1875
+  m = (m << 2) - m;
+  // return m + (m >> 4) + (m >> 8);
+  // improve further to 3.19995 ~ 13107 / 4096. for an error of 15 ppm
+  m += (m >> 4);
+  return m + (m >> 8);
 #elif F_CPU >= 18432000L
-  // m needs to be multiplied by 888.88
-  // and t by 3.47
+  // m needs to be multiplied by 888.89
+  // and t by 125. / 36. ~ 3.472 ~ 889. / 256. for an error of 1 in 8000
   m = (m << 8) + t;
-  return m + (m << 1) + (m >> 1); // Multiply by 3.5
+  // return (m << 2) - (m >> 1) - (m >> 5) + (m >> 8);
+  // improve further to 3.47217 ~ 7111. / 2048. for an error of 16 ppm
+  return (m << 2) - (m >> 1) - (m >> 5) + (m >> 8) - (m >> 11);
+#elif F_CPU >= 18000000L
+  // m needs to be multiplied by 910.22
+  // and t by 3.556 ~ 910. / 256. for an error of 1 in 4096
+  m = (m << 8) + t;
+  m = (m << 2) - (m >> 1);
+  return m + (m >> 6);
 #elif F_CPU >= 14745600L && F_CPU != 16000000L
-  // m needs to be multiplied by 1111.1
-  // and t by 4.34
+  // m needs to be multiplied by 1111.11
+  // and t by 4.34 ~ 1111. / 256. for an error of 100 ppm
   m = (m << 8) + t;
-  return (m << 2) + (m >> 1) - (m >> 3) - (m >> 4); // Multiply by 4.3125
+  return (m << 2) + (m >> 1) - (m >> 3) - (m >> 5) - (m >> 8);
 #elif F_CPU >= 12000000L && F_CPU != 16000000L
   // m needs to be multiplied by 1365.33
-  // and t by 5.33
+  // and t by 5.33 ~ 1365. / 256. for an error of 1 in 4096
   m = (m << 8) + t;
-  return m + (m << 2) + (m >> 2) + (m >> 3) - (m >> 4) + (m >> 5); // Multiply by 5.3437
+  m += (m << 2) + (m >> 2);
+  return m + (m >> 6);
 #elif F_CPU >= 11059200L && F_CPU != 16000000L
   // m needs to be multiplied by 1481.48
-  // and t by 5.78
+  // and t by 5.789 ~ 1482. / 256. for an error of 1 in 2857
   m = (m << 8) + t;
-  return (m << 2) + (m << 1) - (m >> 2) + (m >> 5); // Multiply by 5.78125
+  return (m << 3) - (m << 1) - (m >> 2) + (m >> 5) + (m >> 7);
 #elif F_CPU == 7372800L
   // m needs to be multiplied by 2222.22
-  // and t by 8.68
+  // and t by 8.68 ~ 2222. / 256. for an error of 100 ppm
   m = (m << 8) + t;
-  return (m << 3) + m - (m >> 2) - (m >> 3); // Multiply by 8.625
+  return (m << 3) + m - (m >> 2) - (m >> 4) - (m >> 7);
 #elif F_CPU == 3686400L
   // m needs to be multiplied by 4444.44
-  // and t by 17.36
+  // and t by 17.36 ~ 4444. / 256. for an error of 100 ppm
   m = (m << 8) + t;
-  return (m << 4) + m + (m >> 1) - (m >> 3) - (m >> 6); // Multiply by 17.359375
+  return (m << 4) + (m << 1) - (m >> 1) - (m >> 3) - (m >> 6);
 #elif F_CPU == 1843200L
   // m needs to be multiplied by 8888.88
-  // and t by 34.72
+  // and t by 34.72 ~ 8888. / 256. for an error of 100 ppm
   m = (m << 8) + t;
-  return (m << 5) + (m << 1) + (m >> 1) + (m >> 2); // Multiply by 34.75
+  return (m << 5) + (m << 2) - m - (m >> 2) - (m >> 5);
 #else
   // 32 MHz, 24 MHz, 16 MHz, 8 MHz, 4 MHz, 1 MHz
-  // Shift by 8 to the left (multiply by 256) so t (which is 1 byte in size) can fit in 
+  // Shift by 8 to the left (multiply by 256) so t (which is 1 byte in size) can fit in
   // m & t are multiplied by 4 (since it was already multiplied by 256)
   // t is multiplied by 4
   return ((m << 8) + t) * (64 / clockCyclesPerMicrosecond());
 #endif
+#endif // CORRECT_EXACT_MICROS
 }
 
 void delay(unsigned long ms)
 {
-  uint32_t start = micros();
+  unsigned long start = micros();
 
-  while (ms > 0) {
+  while (ms > 0UL) {
     yield();
-    while ( ms > 0 && (micros() - start) >= 1000) {
+    while (ms > 0UL && (micros() - start) >= 1000UL) {
       ms--;
-      start += 1000;
+      start += 1000UL;
     }
   }
 }
@@ -193,11 +415,15 @@ void delay(unsigned long ms)
  * In Arduino IDE 1.6.11 and newer LTO is enabled by default.  The LTO optimizes the code
  * at link time, making the code (often) significantly smaller without making it "slower"
  * and sometimes destroy acccurate software timings like delayMicroseconds() with lower values.
- * To avoid LTO optimization, the line of delayMicrosecons() definition in arduino.h must be replace to this:
+ * To avoid LTO optimization, the line of delayMicroseconds() definition in arduino.h must be replaced by this:
  * void delayMicroseconds(unsigned int) __attribute__ ((noinline)) ;
  */
 void delayMicroseconds(unsigned int us)
 {
+  // Note:
+  // `us' is multiplied by as much as 6 below, which reduces its usable range.
+  // The README therefore documents the safe calling range as 0 .. 10000 us.
+
   // call = 4 cycles + 1 to 4 cycles to init us(2 for constant delay, 4 for variable,
   //                                            1 for register variable)
 
@@ -206,6 +432,9 @@ void delayMicroseconds(unsigned int us)
   //delay_us(us);
 
 #if F_CPU >= 32000000L
+  // we catch this case so we don't underrun by subtraction
+  if (us == 0) return;           // 3 cycles (.1us) on false, which we ignore
+
   // the following loop takes a 1/4 of a microsecond (8 cycles with nops)
   // per iteration, so execute it four times for each microsecond of
   // delay requested.
@@ -224,8 +453,11 @@ void delayMicroseconds(unsigned int us)
 // # elif F_CPU >= 29491200L
 
 #elif F_CPU >= 25000000L
+  // we catch this case so we don't underrun by subtraction
+  if (us == 0) return;           // 3 cycles (.1us) on false, which we ignore
+
   // the following loop takes a 1/5 of a microsecond (5 cycles)
-  // per iteration, so execute it six times for each microsecond of
+  // per iteration, so execute it five times for each microsecond of
   // delay requested.
   us = (us << 2) + us; // x5 us, = 7 cycles
 
@@ -240,6 +472,9 @@ void delayMicroseconds(unsigned int us)
 #elif F_CPU >= 24000000L
   // for the 24 MHz external clock if someone is working with USB
 
+  // we catch this case so we don't underrun by subtraction
+  if (us == 0) return;           // 3 cycles (.1us) on false, which we ignore
+
   // the following loop takes a 1/6 of a microsecond (4 cycles)
   // per iteration, so execute it six times for each microsecond of
   // delay requested.
@@ -250,7 +485,38 @@ void delayMicroseconds(unsigned int us)
   // us is at least 6 so we can substract 5
   us -= 5; // = 2 cycles
 
-// #elif F_CPU >= 22118400L
+#elif F_CPU >= 22118400L
+  // this is basically the same as for 11.0592, except multiplying by 6, not 3.
+  // the correction factor is the same, but the multiply takes 4 cycles longer.
+
+  // the overhead of the function call is 14 (16) cycles which is ~2/3 us
+  if (us <= 1) return; // = 3 cycles, (4 when true)
+
+  us *= 6; // x6 us, = 9 cycles [{ us = (us<<2)+(us<<1); = 9 cycles too }]
+
+                       // +1 cycle (register save)
+  if (us > 16) // = 3 cycles
+  {
+    // since the loop is not accurately 1/6 of a microsecond we need
+    // to multiply us by 0.9216 (11.0592 / 12 = 22.1184 / 24)
+    us = (us * 60398UL) >> 16;  // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    // this drops us to at least 15
+
+    // account for the time taken in the preceding commands.
+    // we just burned 57 (59) cycles above, remove 14 (14*4=56),
+    // us is at least 15 so we may subtract 14 alright
+    us -= 14; // = 2 cycles
+  }
+  else
+  {
+    // account for the time taken in the preceding commands.
+    // we just burned 31 (33) cycles above, remove 8 (8*4=32),
+    // user wants to wait at least 2 us, after multiply us >= 12
+
+             // 1 cycle when if jump here
+    us -= 8; // 2 cycles
+             // 2 cycles to jump back to delay cycle.
+  }
 
 #elif F_CPU >= 20000000L
   __asm__ __volatile__ (
@@ -290,17 +556,18 @@ void delayMicroseconds(unsigned int us)
 
                        // +1 cycle (register save)
   // user wants to wait longer than 3 us
-  if (us > 15) // = 3 cycles
+  if (us > 17) // = 3 cycles
   {
     // Since the loop is not accurately 1/5 of a microsecond we need
     // to multiply us by 0.9216 (18.432 / 20)
-    us = (us * 60398L) >> 16;   // x0.9216 us = 29 cycles (60398 = 0.9216 * 0x10000L)
+    us = (us * 60398UL) >> 16;  // x0.9216 us = 29 cycles (60398 = 0.9216 * 0x10000L)
+    // this drops us to at least 16
 
     // account for the time taken in the preceeding commands.
     // we just burned 59 (61) cycles above, remove 15, (15*4=60)
     us -= 15; // = 2 cycles
   }
-  else 
+  else
   {
     // account for the time taken in the preceeding commands.
     // we just burned 33 (35) cycles above, remove 9, (9*4=36)
@@ -310,6 +577,42 @@ void delayMicroseconds(unsigned int us)
              // 2 cycles to jump back to delay cycle.
   }
 
+#elif F_CPU >= 18000000L
+  // for the 18 MHz clock, if somebody is working with USB
+  // or otherwise relating to 12 or 24 MHz clocks
+
+  // for a 1 microsecond delay, simply return.  the overhead
+  // of the function call takes 14 (16) cycles, which is .8 us
+  if (us <= 1) return; // = 3 cycles, (4 when true)
+
+  // make the loop below last 6 cycles
+#undef  _MORENOP_
+#define _MORENOP_ " nop \n\t  nop \n\t"
+
+  // the following loop takes 1/3 of a microsecond (6 cycles) per iteration,
+  // so execute it three times for each microsecond of delay requested.
+  us = (us << 1) + us; // x3 us, = 5 cycles
+
+  // account for the time taken in the preceding commands.
+  // we just burned 20 (22) cycles above, remove 3 (3*6=18),
+  // us is at least 6 so we may subtract 3
+  us -= 3; // = 2 cycles
+
+#elif F_CPU >= 16500000L
+  // for a one-microsecond delay, simply return.  the overhead
+  // of the function call takes 14 (16) cycles, which is about 1us
+  if (us <= 1) return; //  = 3 cycles, (4 when true)
+
+  // the following loop takes 1/4 of a microsecond (4 cycles) times 32./33.
+  // per iteration, thus rescale us by 4. * 33. / 32. = 4.125 to compensate
+  us = (us << 2) + (us >> 3); // x4.125 with 22 cycles
+
+  // account for the time taken in the preceding commands.
+  // we burned 37 (39) cycles above, plus 2 below, remove 10 (4*10=40)
+  // us is at least 8, so we subtract only 7 to keep it positive
+  // the error is below one microsecond and not worth extra code
+  us -= 7; // = 2 cycles
+
 #elif F_CPU >= 16000000L
   // for a one-microsecond delay, simply return.  the overhead
   // of the function call takes 14 (16) cycles, which is 1 us
@@ -337,7 +640,8 @@ void delayMicroseconds(unsigned int us)
   {
     // Since the loop is not accurately 1/4 of a microsecond we need
     // to multiply us by 0.9216 (14.7456 / 16)
-    us = (us * 60398L) >> 16;   // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    us = (us * 60398UL) >> 16;  // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    // this drops us to at least 14
 
     // account for the time taken in the preceeding commands.
     // we just burned 53 (57) cycles above, remove 13, (13*4=52)
@@ -377,12 +681,13 @@ void delayMicroseconds(unsigned int us)
   us = (us << 1) + us; // x3 us, = 5 cycles
 
                        // +1 cycle (register save)
-  // user wants to wait longer than 4 us
-  if (us > 14) // = 3 cycles
+  // user wants to wait longer than 5 us
+  if (us > 15) // = 3 cycles
   {
     // since the loop is not accurately 1/3 of a microsecond we need
     // to multiply us by 0.9216 (11.0592 / 12)
-    us = (us * 60398L) >> 16;   // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    us = (us * 60398UL) >> 16;  // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    // this drops us to at least 14
 
     // account for the time taken in the preceeding commands.
     // we just burned 53 (55) cycles above, remove 13, (13*4=52)
@@ -391,7 +696,8 @@ void delayMicroseconds(unsigned int us)
   else
   {
     // account for the time taken in the preceeding commands.
-    // we just burned 27 (29) cycles above, remove 7, (7*4=28)
+    // we just burned 27 (29) cycles above, remove 7 (7*4=28),
+    // us is at least 9, so we may subtract without rollunder
 
              // 1 cycle when if jump here
     us -= 7; // 2 cycles
@@ -406,41 +712,47 @@ void delayMicroseconds(unsigned int us)
   if (us <= 2) return; // = 3 cycles, (4 when true)
 
   // the following loop takes 2/5 of a microsecond (4 cycles)
-  // per iteration, so execute it three times for each microsecond of
-  // delay requested.
+  // per iteration, so execute it five times for every 2 microseconds
+  // of delay requested.
   us = (us << 1) + (us >> 1); // x2.5 us, = 7 cycles
 
   // account for the time taken in the preceeding commands.
-  // we just burned 22 (24) cycles above, remove 5, (5*4=20)
-  // us is at least 20 so we can substract 5
-  us -= 5; // = 2 cycles
+  // we burn 22 (24) cycles above plus 2 below, remove 6, (6*4=24)
+  // us is at least 7 so we can subtract 6
+  us -= 6; // = 2 cycles
 
 #elif F_CPU >= 9216000L
   // the overhead of the function call is 14 (16) cycles which is ~1.5 us
-  if (us <= 3) return; // = 3 cycles, (4 when true)
+  if (us <= 2) return; // = 3 cycles, (4 when true)
 
-  us = (us << 2) + us ; // x2.5x2 us, = 7 cycles
+  // aim for 10 cycles per us: multiply us by 2 here, make the loop last 5 cycles
+  us <<= 1; // x2 us, = 2 cycles
+
+  // make the delay loop last 5 cycles
+#undef  _MORENOP_
+#define _MORENOP_ " nop \n\t"
 
                        // +1 cycle (register save)
-  // user wants to wait longer than 6 us
-  if (us > 30) // = 3 cycles
+  // user wants to wait longer than 5 us
+  if (us > 11) // = 3 cycles
   {
-    // since the loop is not accurately 2/5 of a microsecond we need
+    // since the loop is not accurately 1/2 of a microsecond we need
     // to multiply us by 0.9216 (11.0592 / 12)
-    us = (us * 30199L) >> 16;   // x(0.9216/2) us = 29 cycles (30199 = 0.4608 x 0x10000L)
+    us = (us * 60398UL) >> 16;   // x(0.9216) us = 29 cycles
+    // this drops us to at least 11
 
     // account for the time taken in the preceeding commands.
-    // we just burned 53 (55) cycles above, remove 13, (13*4=52)
-    us -= 13; // = 2 cycles
+    // we just burned 48 (50) cycles above, remove 10 (10*5=50)
+    us -= 10; // = 2 cycles
   }
   else
   {
     // account for the time taken in the preceeding commands.
-    // we just burned 31 (33) cycles above, remove 8, (8*4=32)
+    // we just burned 26 (28) cycles above, remove 5 (5*5=25)
+    // us is at least 6 so we may subtract 5
 
               // 1 cycle when if jump here
-    us >>= 1; // 2 cycles restore x2.5 us
-    us -=  8; // 2 cycles
+    us -= 5;  // 2 cycles
               // 2 cycles to jump back to delay cycle.
   }
 
@@ -479,7 +791,8 @@ void delayMicroseconds(unsigned int us)
   {
     // since the loop is not accurately 1/2 of a microsecond we need
     // to multiply us by 0.9216 (7.3728 / 8)
-    us = (us * 60398L) >> 16;   // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    us = (us * 60398UL) >> 16;  // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    // this drops us to at least 14
 
     // account for the time taken in the preceeding commands.
     // we just burned 52 (54) cycles above, remove 13, (13*4=52)
@@ -496,6 +809,20 @@ void delayMicroseconds(unsigned int us)
              // 2 cycles to jump back to delay cycle.
   }
 
+#elif F_CPU >= 6000000L
+  // for a 1 to 3 microsecond delay, simply return.  the overhead
+  // of the function call takes 14 (16) cycles, which is 2.5us
+  if (us <= 3) return; //  = 3 cycles, (4 when true)
+
+  // make the loop below last 6 cycles
+#undef  _MORENOP_
+#define _MORENOP_ " nop \n\t  nop \n\t"
+
+  // the following loop takes 1 microsecond (6 cycles) per iteration
+  // we burned 15 (17) cycles above, plus 2 below, remove 3 (3 * 6 = 18)
+  // us is at least 4 so we can subtract 3
+  us -= 3; // = 2 cycles
+
 #elif F_CPU >= 4000000L
   __asm__ __volatile__ ("nop"); // just waiting 1 cycle
   // the overhead of the function call is 15 (17) cycles which is 4 us
@@ -511,13 +838,16 @@ void delayMicroseconds(unsigned int us)
   // of the function call takes 14 (16) cycles, which is almost 4 us
   if (us <= 6) return; // = 3 cycles, (4 when true)
 
+                       // Question:
+                       // Are we certain that there is a register save?
                        // +1 cycle (register save)
-  // user wants to wait longer than 12 us
-  if (us > 12) // = 3 cycles
+  // user wants to wait longer than 14 us
+  if (us > 14) // = 3 cycles
   {
     // since the loop is not accurately 1 microsecond we need
     // to multiply us by 0.9216 ( = 3.6864 / 4)
-    us = (us * 60398L) >> 16;   // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    us = (us * 60398UL) >> 16;  // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    // after scaling, us is still at least 13 (15 * 0.9216, rounded down)
 
     // account for the time taken in the preceeding commands.
     // we just burned 47 (49) cycles above, remove 12, (12*4=48)
@@ -551,15 +881,16 @@ void delayMicroseconds(unsigned int us)
 #elif F_CPU >= 1843200L
   // for less than 13 microsecond delay, simply return. the overhead
   // of the function call takes 14 (16) cycles, which is almost 8 us
-  if (us <= 12) return; // = 3 cycles, (4 when true)
+  if (us <= 13) return; // = 3 cycles, (4 when true)
 
                         // no register save here
-  // user wants to wait longer than 25 us
-  if (us > 25) // = 3 cycles
+  // user wants to wait longer than 54 us
+  if (us > 54) // = 3 cycles
   {
     // since the loop takes ~2.17 microseconds we need
     // to multiply us by 0.4608 ( = 1.8432 / 2 / 2 )
     us = (us * 30199L) >> 16;   // x(0.9216/2) us = 29 cycles (30199 = 0.4608 x 0x10000L)
+    // after scaling, us is still at least 25 (55 * 0.4608, rounded down)
 
     // account for the time taken in the preceeding commands.
     // we just burned 47 (49) cycles above, remove 24, microseconds
@@ -572,6 +903,7 @@ void delayMicroseconds(unsigned int us)
 
               // 1 cycle when if jump here
     us -= 12; // 2 cycles
+    // us is now at least 2, so the division by 2 below yields at least 1
 
     us >>= 1; // division by 2 = 2 cycles
               // 2 cycles to jump back to delay cycle.
@@ -591,13 +923,14 @@ void delayMicroseconds(unsigned int us)
   // us is at least 4, divided by 4 gives us 1 (no zero delay bug)
   us >>= 2; // us div 4, = 4 cycles
 
-
 #endif
 
   // busy wait
   __asm__ __volatile__ (
     "1: sbiw %0,1" "\n\t"            // 2 cycles
-        _MORENOP_                    // 4 cycles if 32 MHz or 1 cycle if 25 MHz
+        _MORENOP_                    // 4 cycles if 32 MHz or
+                                     // 1 cycle  if 25, 9.216
+                                     // 2 cycles if 18, 6 MHz
     "   brne 1b"                     // 2 cycles
     : /* no outputs */
     : "w" (us)
@@ -698,9 +1031,9 @@ void init()
 
 #if defined(TCCR4A) && defined(TCCR4B) && defined(TCCR4D)
   TCCR4B |= _BV(CS42) | _BV(CS41) | _BV(CS40); // Set timer 4 prescale factor to 64
-  TCCR4D |= _BV(WGM40);                        // Put timer 4 in phase- and frequency-correct PWM mode 
+  TCCR4D |= _BV(WGM40);                        // Put timer 4 in phase- and frequency-correct PWM mode
   TCCR4A |= _BV(PWM4A);                        // Enable PWM mode for comparator OCR4A
-  TCCR4C |= _BV(PWM4D);                        // Enable PWM mode for comparator OCR4D 
+  TCCR4C |= _BV(PWM4D);                        // Enable PWM mode for comparator OCR4D
 #elif defined(TCCR4B) && defined(CS41) && defined(WGM40)
   TCCR4B |= _BV(CS41) | _BV(CS40); // Set timer 4 prescale factor to 64
   TCCR4A |= _BV(WGM40);            // Put timer 4 in 8-bit phase correct pwm mode
diff --git a/avr/cores/MCUdude_corefiles/wiring_analog.c b/avr/cores/MCUdude_corefiles/wiring_analog.c
index 3e609533..64010031 100755
--- a/avr/cores/MCUdude_corefiles/wiring_analog.c
+++ b/avr/cores/MCUdude_corefiles/wiring_analog.c
@@ -77,7 +77,7 @@ int analogRead(uint8_t pin)
   ADCSRA |= _BV(ADSC);
 
   // ADSC is cleared when the conversion finishes
-  while (bit_is_set(ADCSRA, ADSC));
+  while (ADCSRA & _BV(ADSC));
 
   // we have to read ADCL first; doing so locks both ADCL
   // and ADCH until ADCH is read.  reading ADCL second would