From caf26c7898ffb7d8853a1dd21fbe71f95ae3948c Mon Sep 17 00:00:00 2001
From: MCUdude <hansibull@gmail.com>
Date: Fri, 19 Feb 2021 21:22:15 +0100
Subject: [PATCH] Squashed 'avr/cores/MCUdude_corefiles/' changes from
 fa31758e9..a367b8b9a

a367b8b9a Merge pull request #32 from cburstedde/feature-exact-timing
e7f9313ce Merge branch 'master' into feature-exact-timing
c24da8fcb Updated accuracy calculation
7efce6186 Replace bit_is_set with "regular" and operation
5fba4671d Minimal accuracy tweaks
0f33b77ce Merge pull request #31 from cburstedde/feature-exact-timing
d10b60393 Update README
299220515 Exact or very accurate millis/micros/delay always
065732108 Non-functional tweaks
0dd91a679 Update README
25fad398a Add exact timing for 9.216 MHz
19b8514f5 Merge branch 'master' into feature-exact-timing
5d4ba89e8 Correct millis and micros for arbitrary frequencies
8913fb360 Add missing parentheses
b622f9856 Add exact timing for 10 MHz
0333bcea5 Update README
5af5045c8 Add accurate timing for 16.5 MHz
e318ce1ca Add exact timing for 6 MHz
0c0b4f3a8 Remove superfluous cast
aeaadea0f Merge pull request #29 from cburstedde/feature-delayMicroseconds
4e74473d7 Merge pull request #28 from cburstedde/feature-correct-micros
6916530f2 Fix several corner cases in delayMicroseconds()
038385eea It makes a difference using unsigned math!!
912145b81 Go back to long multiply but still optimize
909105605 Use faster unsigned int multiply in micros()
eeb6635e7 Merge pull request #27 from cburstedde/feature-exact-timing
21d1d6fa7 README typo
9be3a3e42 Disable alternate micros algorithm for powers of 2
bcd8baa5a Move timer0_exact static variable def into ISR
745508971 Rename correction #define and variable
9fef70e27 Reduce unnecessary long comparison to char
95166c10e Correct bad idea: access fract with interrupts off
f64aa5d03 Remove unneeded unsigned char in micros()
a91536407 White space in wiring.c
f683310fa Make micros zero-drift based on exact millis
b28acdb84 Make 25 MHz a supported clock frequency
2a6c810ce Comments and README edits
7cde04dd0 Merge pull request #26 from cburstedde/feature-delayMicroseconds
0172627e2 Merge branch 'master' into feature-delayMicroseconds
3b1142ffd Merge pull request #25 from cburstedde/feature-correct-micros
e858df7d8 Merge branch 'master' into feature-correct-micros
b32ca2bc5 Merge pull request #24 from cburstedde/feature-correct-millis
0205fd55b delayMicroseconds() safe for us == 0 and >= 24 MHz
032bed72f Statement on delayMicroseconds() in README
afa0f8f69 Use 60398UL unsigned long constant
9b7291ae3 Add 22.1184 and 18 MHz cases to delayMicroseconds
ec16def12 Add/edit comments for delayMicroseconds()
4a89b74ed Replace abs macro ... with a "safe" version of the original abs macro. Unlike __builtin_abs() this also deals with floats, which is pretty much required to ensure compatibility against the official Arduino core(s).
57b7486af Tune ppm values in README
059d20b18 Optimize away two increments from timing ISR
3d890c22d Improve micros() to 100 ppm for 7.37, 3.68, 1.84
d852091be Tiny README update
39112b389 Simplify millis calculation to fit into long int
d1719372c Improve micros() accuracy for 14.7456, 12, 11.0952
84c8f335d micros() below crystal tolerance for 18.432, 20 MHz
b8002ce05 Update README for new/improved micros()
239425178 Make micros calculation more efficient for 20 MHz
d736735fd Add 18 MHz clock to micros()
21cf2177e Improve accuracy of 24 MHz micros()
3b9aa8990 Add 18 MHz clock to millis() calculation
65aa77f4d Fix timing error for non-power-of-two clocks
0091354f8 Add discussion of micros and delay to README
af3b94c79 Non-functional clarification
2568a3f85 Add millis() discussion to README
56512e963 Add millis () correction to make 22.1184 MHz exact
2d42b729d Add 22.1184 MHz micros () correction
46fa6940c Prevent micros for 32 MHz to enter wrong case
49f5f250a Increase micros () accuracy for 18.432 and 20 MHz
b51058fe1 Supply millis () correction for 24 MHz
993843d4d Add two odd frequencies 7.37 and 3.69 MHz
25c1c988b Correction makes millis() exact for several speeds

git-subtree-dir: avr/cores/MCUdude_corefiles
git-subtree-split: a367b8b9ab17653a379108e48d7696bf1c6ca336
---
 Arduino.h       |   8 +-
 README.md       |  76 +++++++-
 wiring.c        | 485 ++++++++++++++++++++++++++++++++++++++++--------
 wiring_analog.c |   2 +-
 4 files changed, 489 insertions(+), 82 deletions(-)

diff --git a/Arduino.h b/Arduino.h
index 78eeb8a2d..fe0062677 100755
--- a/Arduino.h
+++ b/Arduino.h
@@ -121,11 +121,11 @@ void yield(void);
 #undef abs
 #endif
 
-#define abs(x)       __builtin_abs(x)
+#define abs(x)       ({ typeof (x) _x = (x); _x > 0 ? _x : -_x; })
 #define sq(x)        ({ typeof (x) _x = (x); _x * _x; })
-#define min(a,b)     ({ typeof (a) _a = (a); typeof (b) _b = (b); _a < _b ? _a : _b;    })
-#define max(a,b)     ({ typeof (a) _a = (a); typeof (b) _b = (b); _a > _b ? _a : _b;    })
-#define round(x)     ({ typeof (x) _x = (x); _x >= 0 ? (long)_x + 0.5 : (long)_x - 0.5; })
+#define min(a,b)     ({ typeof (a) _a = (a); typeof (b) _b = (b); _a < _b ? _a : _b; })
+#define max(a,b)     ({ typeof (a) _a = (a); typeof (b) _b = (b); _a > _b ? _a : _b; })
+#define round(x)     ({ typeof (x) _x = (x); _x >= 0 ? (long)(_x + 0.5) : (long)(_x - 0.5); })
 #define radians(deg) ((deg) * DEG_TO_RAD)
 #define degrees(rad) ((rad) * RAD_TO_DEG)
 #define constrain(x,low,high)     ({ \
diff --git a/README.md b/README.md
index 71e81674d..8c68f6da3 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,9 @@
 
 This repo contains the Arduino corefiles used with [MightyCore](https://github.com/MCUdude/MightyCore), [MegaCore](https://github.com/MCUdude/MegaCore), [MiniCore](https://github.com/MCUdude/MiniCore) and [MajorCore](https://github.com/MCUdude/MightyCore).
 
+
 ## Supported devices
+
 * ATmega640, ATmega1280, ATmega2560
 * ATmega64, ATmega128, ATmega1281, ATmega2561
 * AT90CAN32, AT90CAN64, AT90CAN128
@@ -11,20 +13,92 @@ This repo contains the Arduino corefiles used with [MightyCore](https://github.c
 * ATmega8515, ATmega162
 * ATmega8, ATmega48/P/PA/PB, ATmega88/P/PA/PB, ATmega168/P/PA/PB, ATmega328/P/PA/PB
 
+
 ## Supported clock frequencies
-By supported I mean clocks that accurate timing is implemented for (millis, micros, delay, delayMicroseconds).
+
+"Supported" means that accurate timing (millis, micros, delay,
+delayMicroseconds) is implemented for the clock frequency.
+
 * 32 MHz
+* 25 MHz
 * 24 MHz
+* 22.1184 MHz
 * 20 MHz
 * 18.432 MHz
+* 18 MHz
+* 16.5 MHz
 * 16 MHz
 * 14.7456 MHz
 * 12 MHz
 * 11.0592 MHz
+* 10 MHz
+* 9.216 MHz
 * 8 MHz
 * 7.3728 MHz
+* 6 MHz
 * 4 MHz
 * 3.6864 MHz
 * 2 MHz
 * 1.8432 MHz
 * 1 MHz
+
+
+### Adding further clock frequencies
+
+The calculation of `millis()`, `micros()` and `delay()` is automatic for
+arbitrary frequencies.
+Depending on the prime factors of the frequency, it is either exact or
+approximate to 60 ppm accuracy (worst-case).
+The only thing required is adding support in `delayMicroseconds()`.
+
+
+### Exactness of `delayMicroseconds()`
+
+The `delayMicroseconds(unsigned int us)` implementation is exact up to a few
+cycles for the frequencies listed above.
+
+The largest argument that works reliably is 10000, i.e. 10 milliseconds.
+Its result is affected by interrupts occurring, which may prolong the delay.
+
+
+### Exactness of `micros()` and `delay()`
+
+For the clock speeds listed above, `micros()` is corrected to zero drift.
+Even for very long run times, the `micros()` function will precisely follow the
+oscillator used.
+
+Frequencies not listed above are either exact or corrected to below 60 ppm drift
+and in exact sync with `millis()`.
+
+Note that the result of `micros()` may jump up by several microseconds between
+consecutive calls and rolls over after one hour and eleven minutes.
+
+The `delay()` function uses `micros()` internally and inherits its drift accuracy
+with slight variations due to function call overhead and processing.
+It is immune to interrupts and thus long-term accurate.
+
+
+### Exactness of `millis()`
+
+For the clock speeds listed above, `millis()` is corrected to zero drift.
+Even for very long run times, the `millis()` function will precisely follow the
+oscillator used.
+
+Frequencies not listed above are either exact or corrected to below 60 ppm drift
+and in exact sync with `micros()` and `delay()`.
+
+We do not track the rollover of the `unsigned long` millis counter that
+occurs every 49.7 days; that would have to be done in the user's program.
+Often this is not necessary:  The code
+
+    if (millis() - millis_old >= interval) {
+      /* do something */
+      millis_old += interval;
+    }
+
+is long-term accurate even when rolling over provided `millis_old` is of type
+`unsigned long`.
+
+For clock speeds of 16 MHz and below, the return value of `millis()`
+occasionally jumps up by more than one (notwithstanding low/zero drift).
+Thus, when relying on consecutive returns, run at 16.5 MHz or higher.
diff --git a/wiring.c b/wiring.c
index 6b0df7f7c..9d2bf2ec2 100755
--- a/wiring.c
+++ b/wiring.c
@@ -24,29 +24,131 @@
 
 // the prescaler is set so that timer0 ticks every 64 clock cycles, and the
 // the overflow handler is called every 256 ticks.
-// 24MHz: An overflow happens every 682.67 microseconds ---> 0.04167, so this results in 682 
+// 24MHz: An overflow happens every 682.67 microseconds ---> 0.04167, so this results in 682
 // 20MHz: An overflow happens every 819.2 microseconds ---> 0,05 (time of a cycle in micros) * 64 (timer0 tick) * 256 (every 256 ticks timer0 overflows), so this results in 819
 // 16MHz: An overflow happens every 1024 microseconds
+#if 0
+// this would be inaccurate for non-power-of-two frequencies
 #define MICROSECONDS_PER_TIMER0_OVERFLOW (clockCyclesToMicroseconds(64 * 256))
+#else
+// It is vital to avoid unnecessary roundoff in this calculation.
+// What we really want to compute is the number of microseconds in one
+// timer cycle, thus 64 * 256 * 1e6 / F_CPU.  When calculating with integers,
+// the product 64 * 256 * 1000**2 overflows an unsigned long.  We resolve this
+// by recognizing that F_CPU is evenly divisible by 10 in all cases.  Thus, we
+// cancel a factor of 10 on both sides, which allows us to use unsigned long.
+// It turns out that code runs faster when the number is explicitly unsigned!
+#define MICROSECONDS_PER_TIMER0_OVERFLOW \
+  (64UL * 256UL * 100000UL / ((F_CPU + 5UL) / 10UL))
+#endif
 
 // the whole number of milliseconds per timer0 overflow
 // For 20MHz this would be 0 (because of 819)
 // For 16MHz this would be 1 (because of 1024)
-#define MILLIS_INC (MICROSECONDS_PER_TIMER0_OVERFLOW / 1000)
+#define MILLIS_INC (MICROSECONDS_PER_TIMER0_OVERFLOW / 1000U)
 
 // the fractional number of milliseconds per timer0 overflow. we shift right
 // by three to fit these numbers into a byte. (for the clock speeds we care
 // about - 8 and 16 MHz - this doesn't lose precision.)
 // For 16 MHz: 24 (1024 % 1000) gets shifted right by 3 which results in 3   (precision was lost)
 // For 20 MHz: 819 (819 % 1000) gets shifted right by 3 which results in 102 (precision was lost)
-// For 24 MHz: 682 (682 % 1000) gets shifted right by 3 which results in 
-#define FRACT_INC ((MICROSECONDS_PER_TIMER0_OVERFLOW % 1000) >> 3)
+// For 24 MHz: 682 (682 % 1000) gets shifted right by 3 which results in 85
+#define FRACT_INC ((MICROSECONDS_PER_TIMER0_OVERFLOW % 1000U) >> 3)
 // Shift right by 3 to fit in a byte (results in 125)
-#define FRACT_MAX (1000 >> 3)
+#define FRACT_MAX (1000U >> 3)
 
-volatile unsigned long timer0_overflow_count = 0;
 volatile unsigned long timer0_millis = 0;
-static unsigned char timer0_fract = 0;
+volatile unsigned char timer0_fract = 0;
+
+// Add a correction calculation to make millis () exact for most clocks.
+// The idea is to compare the exact microseconds/8 between overflows,
+// namely (1. / F_CPU * 64. * 256. * 1e6) % 1000 / 8.,
+// with the integer rounded down version in FRACT_INC.
+// For the clock speeds examined below, we encounter four different cases.
+// The low case: FRACT_INC is too low by a fraction 1 / n.
+//               Correct by adding 1 to the fract counter every n-th time.
+// The high case: FRACT_INC is too low by a fraction (n - 1) / n.
+//               Add 1 to the fract counter every time except every n-th.
+// A special case for 20 MHz: FRACT_INC is too low by the fraction 2. / 5.
+//               Correct by adding 2 out of 5 times: every odd number in 0..4.
+// A special case for 11.0592 MHz: FRACT_INC is too low by 5. / 27.
+//               Correct brute force by counting 5 out of 27.
+//               Do it the same way for the remaining odd cases.
+// This way we correct losses from both the rounding to usecs and the shift.
+// For the remaining non-exact cases, we use a highly accurate approximation.
+// This happens to be exact, too, for leftover UART-related frequencies.
+#define FRACT_INC_PLUS
+#define EXACT_NUM (64UL * 256UL * 125UL * 100UL)
+#define EXACT_DEN ((F_CPU + 5UL) / 10UL)
+#define EXACT_REM (EXACT_NUM - (EXACT_NUM / EXACT_DEN) * EXACT_DEN)
+#if EXACT_REM > 0 || MICROSECONDS_PER_TIMER0_OVERFLOW % 256 > 0 // correct
+#define CORRECT_EXACT_MILLIS
+#define CORRECT_EXACT_MICROS
+#if F_CPU == 25000000L          // for 25 MHz we get 81.92, off by 23./25.
+#define CORRECT_BRUTE 23
+#define CORRECT_ROLL 25
+#elif F_CPU == 24000000L        // for 24 MHz we get 85.33, off by 1./3.
+#define CORRECT_LO
+#define CORRECT_ROLL 3
+#elif F_CPU == 22118400L        // for 22.1184 MHz we get 92 + 16./27.
+#define CORRECT_BRUTE 16
+#define CORRECT_ROLL 27
+#elif F_CPU == 20000000L        // for 20 MHz we get 102.4, off by 2./5.
+#define CORRECT_ODD
+#define CORRECT_ROLL 5
+#elif F_CPU == 18432000L        // for 18.432 MHz we get 111.11, off by 1./9.
+#define CORRECT_LO
+#define CORRECT_ROLL 9
+#elif F_CPU == 18000000L        // for 18 MHz we get 113.78, off by 7./9.
+#define CORRECT_BRUTE 7
+#define CORRECT_ROLL 9
+#elif F_CPU == 16500000L        // for 16.5 MHz we get 124 + 4./33.
+#define CORRECT_BRUTE 4
+#define CORRECT_ROLL 33
+#elif F_CPU == 14745600L        // for 14.7456 MHz we get 13.89, off by 8./9.
+#define CORRECT_HI
+#define CORRECT_ROLL 9
+#elif F_CPU == 12000000L        // for 12 MHz we get 45.67, off by 2./3.
+#define CORRECT_HI
+#define CORRECT_ROLL 3
+#elif F_CPU == 11059200L        // for 11.0592 MHz we get 60 + 5./27.
+#define CORRECT_BRUTE 5
+#define CORRECT_ROLL 27
+#elif F_CPU == 10000000L        // for 10 MHz we get 79.8, off by 4./5.
+#define CORRECT_HI
+#define CORRECT_ROLL 5
+#elif F_CPU == 9216000L         // for 9.216 MHz we get 97. + 2./9.
+#define CORRECT_BRUTE 2
+#define CORRECT_ROLL 9
+#elif F_CPU == 7372800L         // for 7.3728 MHz we get 27 + 7./9.
+#define CORRECT_BRUTE 7
+#define CORRECT_ROLL 9
+#elif F_CPU == 6000000L         // for 6 MHz we get 91 + 1./3.
+#define CORRECT_LO
+#define CORRECT_ROLL 3
+#elif F_CPU == 3686400L         // for 3.6864 MHz we get 55 + 5./9.
+#define CORRECT_BRUTE 5
+#define CORRECT_ROLL 9
+#elif F_CPU == 1843200L         // for 1.8432 MHz we get 111.11, off by 1./9.
+#define CORRECT_LO
+#define CORRECT_ROLL 9
+#else                           // fallback accurate to better than 60 ppm
+#define CORRECT_BRUTE ((2U * 135U * EXACT_REM + EXACT_DEN) / (2U * EXACT_DEN))
+#define CORRECT_ROLL 135
+#if CORRECT_BRUTE <= 0
+#undef CORRECT_EXACT_MILLIS     // low corner case amounts to nothing
+#elif CORRECT_BRUTE >= CORRECT_ROLL
+#undef CORRECT_EXACT_MILLIS
+#undef FRACT_INC_PLUS
+#define FRACT_INC_PLUS + 1      // high corner case always adds one extra
+#endif
+#endif // fallback
+#endif // EXACT_REM > 0
+
+#ifndef CORRECT_EXACT_MICROS
+// variable is only needed in micros() calculation without exactness correction
+volatile unsigned long timer0_overflow_count = 0;
+#endif
 
 // timer0 interrupt routine ,- is called every time timer0 overflows
 #if defined(__AVR_ATtiny24__) || defined(__AVR_ATtiny44__) || defined(__AVR_ATtiny84__)
@@ -55,21 +157,59 @@ ISR(TIM0_OVF_vect)
 ISR(TIMER0_OVF_vect)
 #endif
 {
+#ifdef CORRECT_EXACT_MILLIS
+  // this is a variable that retains its value between calls
+  static unsigned char timer0_exact = 0;
+#endif
+
   // copy these to local variables so they can be stored in registers
   // (volatile variables must be read from memory on every access, so this saves time)
   unsigned long m = timer0_millis;
   unsigned char f = timer0_fract;
 
-  m += MILLIS_INC;
-  f += FRACT_INC;
+  f += FRACT_INC FRACT_INC_PLUS;
+
+#ifdef CORRECT_EXACT_MILLIS
+  // correct millis () to be exact for certain clocks
+  if (timer0_exact == CORRECT_ROLL - 1) {
+    timer0_exact = 0;
+#ifdef CORRECT_LO
+    ++f;
+#endif
+  }
+  else {
+    ++timer0_exact;
+#ifdef CORRECT_HI
+    ++f;
+#endif
+  }
+  // it does not matter for the long-time drift whether the following two
+  // corrections take place before or after the increment of timer0_exact
+#ifdef CORRECT_ODD
+  if (timer0_exact & 1) {
+    ++f;
+  }
+#endif
+#ifdef CORRECT_BRUTE
+  if (timer0_exact < CORRECT_BRUTE) {
+    ++f;
+  }
+#endif
+#endif // CORRECT_EXACT_MILLIS
+
   if (f >= FRACT_MAX) {
     f -= FRACT_MAX;
-    m += 1;
+    m += MILLIS_INC + 1;
+  }
+  else {
+    m += MILLIS_INC;
   }
 
   timer0_fract = f;
   timer0_millis = m;
+#ifndef CORRECT_EXACT_MICROS
   timer0_overflow_count++;
+#endif
 }
 
 unsigned long millis()
@@ -88,13 +228,24 @@ unsigned long millis()
 
 unsigned long micros() {
   unsigned long m;
-  uint8_t oldSREG = SREG;
+#ifdef CORRECT_EXACT_MICROS
+  unsigned char f; // temporary storage for millis fraction counter
+  unsigned char q = 0; // record whether an overflow is flagged
+#endif
   // t will be the number where the timer0 counter stopped
   uint8_t t;
+  uint8_t oldSREG = SREG;
 
   // Stop all interrupts
   cli();
+
+#ifdef CORRECT_EXACT_MICROS
+  // combine exact millisec and 8usec counters
+  m = timer0_millis;
+  f = timer0_fract;
+#else
   m = timer0_overflow_count;
+#endif
 
   // TCNT0 : The Timer Counter Register
 #if defined(TCNT0)
@@ -108,77 +259,148 @@ unsigned long micros() {
   // Timer0 Interrupt Flag Register
 #ifdef TIFR0
   if ((TIFR0 & _BV(TOV0)) && (t < 255))
+#ifndef CORRECT_EXACT_MICROS
     m++;
+#else
+    q = 1;
+#endif
 #else
   if ((TIFR & _BV(TOV0)) && (t < 255))
+#ifndef CORRECT_EXACT_MICROS
     m++;
+#else
+    q = 1;
+#endif
 #endif
   // Restore SREG
   SREG = oldSREG;
 
-#if F_CPU >= 24000000L && F_CPU < 32000000L
+#ifdef CORRECT_EXACT_MICROS
+  /* We convert milliseconds, fractional part and timer value
+     into a microsecond value.  Relies on CORRECT_EXACT_MILLIS.
+     Basically we multiply by 1000 and add the scaled timer.
+
+     The leading part by m and f is long-term accurate.
+     For the timer we just need to be close from below.
+     Must never be too high, or micros jumps backwards. */
+  m = (((m << 7) - (m << 1) - m + f) << 3) +
+      ((t * MICROSECONDS_PER_TIMER0_OVERFLOW) >> 8);
+  return q ? m + MICROSECONDS_PER_TIMER0_OVERFLOW : m;
+#elif 1
+  /* All power-of-two Megahertz frequencies enter here, as well as 12.8 MHz.
+     We only end up here if right shift before multiplication is exact. */
+  return ((m << 8) + t) * (MICROSECONDS_PER_TIMER0_OVERFLOW >> 8);
+#else
+/*
+ * This is the old code requiring individual treatment for each frequency.
+ * It has the following accuracy for non-power-of-two MHz frequencies.
+ *
+ * 20 MHz has a drift of 1 in 65536 (~15 ppm)
+ * 18.432 MHz has a drift of 1 in 64000 (~16 ppm)
+ * 25 MHz      has a drift of 1 in 43691 (~23 ppm)
+ * 14.7456 MHz has a drift of 1 in 10000 (100 ppm)
+ *  7.3728 MHz has a drift of 1 in 10000
+ *  3.6864 MHz has a drift of 1 in 10000
+ *  1.8432 MHz has a drift of 1 in 10000
+ * 24 MHz has a drift of 1 in 4096 (244 ppm)
+ * 18 MHz has a drift of 1 in 4096
+ * 12 MHz has a drift of 1 in 4096
+ * 22.1184 MHz has a drift of 1 in 2857 (350ppm)
+ * 11.0592 MHz has a drift of 1 in 2857
+*/
+#if F_CPU >= 32000000L
+  // we need to put this #if here to avoid entering the wrong branch for 32 MHz
+  return ((m << 8) + t) * (64 / clockCyclesPerMicrosecond());
+#elif F_CPU >= 25000000L
+  // m needs to be multiplied by 655.36
+  // and t by 2.56 ~ 5243 / 2048. for an error of 1 in 43691 (23 ppm)
+  m = (m << 8) + t;
+  // How many shift adds does it take until long multiply becomes faster?
+  // Can we just return (m * 41943UL) >> 14 and be done to 1ppm accuracy.
+  return (m << 2) - m - (m >> 1) + (m >> 4) - (m >> 9) - (m >> 11);
+#elif F_CPU >= 24000000L
   // m needs to be multiplied by 682.67
-  // and t by 2.67
+  // and t by 2.667 ~ 1365 / 512. for an error of 1 in 4096 (244 ppm)
+  m = (m << 8) + t;
+  m = (m << 1) + (m >> 1) + (m >> 3);
+  return m + (m >> 6);
+#elif F_CPU >= 22118400L
+  // m needs to be multiplied by 740.74
+  // and t by 2.894 ~ 741 / 256. for an error of 1 in 2857 (350 ppm)
   m = (m << 8) + t;
-  return (m << 1) + (m >> 1) + (m >> 3) + (m >> 4); // Multiply by 2.6875
+  return m + (m << 1) - (m >> 3) + (m >> 6) + (m >> 8);
 #elif F_CPU >= 20000000L
-  // m needs to be multiplied by 819.2 
-  // t needs to be multiplied by 3.2
+  // m needs to be multiplied by 819.2
+  // and t by 16. / 5. = 3.2 ~ 819 / 256. for an error of 1 in 4096
   m = (m << 8) + t;
-  return m + (m << 1) + (m >> 2) - (m >> 4); // Multiply by 3.1875
+  m = (m << 2) - m;
+  // return m + (m >> 4) + (m >> 8);
+  // improve further to 3.19995 ~ 13107 / 4096. for an error of 15 ppm
+  m += (m >> 4);
+  return m + (m >> 8);
 #elif F_CPU >= 18432000L
-  // m needs to be multiplied by 888.88
-  // and t by 3.47
+  // m needs to be multiplied by 888.89
+  // and t by 125. / 36. ~ 3.472 ~ 889. / 256. for an error of 1 in 8000
   m = (m << 8) + t;
-  return m + (m << 1) + (m >> 1); // Multiply by 3.5
+  // return (m << 2) - (m >> 1) - (m >> 5) + (m >> 8);
+  // improve further to 3.47217 ~ 7111. / 2048. for an error of 16 ppm
+  return (m << 2) - (m >> 1) - (m >> 5) + (m >> 8) - (m >> 11);
+#elif F_CPU >= 18000000L
+  // m needs to be multiplied by 910.22
+  // and t by 3.556 ~ 910. / 256. for an error of 1 in 4096
+  m = (m << 8) + t;
+  m = (m << 2) - (m >> 1);
+  return m + (m >> 6);
 #elif F_CPU >= 14745600L && F_CPU != 16000000L
-  // m needs to be multiplied by 1111.1
-  // and t by 4.34
+  // m needs to be multiplied by 1111.11
+  // and t by 4.34 ~ 1111. / 256. for an error of 100 ppm
   m = (m << 8) + t;
-  return (m << 2) + (m >> 1) - (m >> 3) - (m >> 4); // Multiply by 4.3125
+  return (m << 2) + (m >> 1) - (m >> 3) - (m >> 5) - (m >> 8);
 #elif F_CPU >= 12000000L && F_CPU != 16000000L
   // m needs to be multiplied by 1365.33
-  // and t by 5.33
+  // and t by 5.33 ~ 1365. / 256. for an error of 1 in 4096
   m = (m << 8) + t;
-  return m + (m << 2) + (m >> 2) + (m >> 3) - (m >> 4) + (m >> 5); // Multiply by 5.3437
+  m += (m << 2) + (m >> 2);
+  return m + (m >> 6);
 #elif F_CPU >= 11059200L && F_CPU != 16000000L
   // m needs to be multiplied by 1481.48
-  // and t by 5.78
+  // and t by 5.789 ~ 1482. / 256. for an error of 1 in 2857
   m = (m << 8) + t;
-  return (m << 2) + (m << 1) - (m >> 2) + (m >> 5); // Multiply by 5.78125
+  return (m << 3) - (m << 1) - (m >> 2) + (m >> 5) + (m >> 7);
 #elif F_CPU == 7372800L
   // m needs to be multiplied by 2222.22
-  // and t by 8.68
+  // and t by 8.68 ~ 2222. / 256. for an error of 100 ppm
   m = (m << 8) + t;
-  return (m << 3) + m - (m >> 2) - (m >> 3); // Multiply by 8.625
+  return (m << 3) + m - (m >> 2) - (m >> 4) - (m >> 7);
 #elif F_CPU == 3686400L
   // m needs to be multiplied by 4444.44
-  // and t by 17.36
+  // and t by 17.36 ~ 4444. / 256. for an error of 100 ppm
   m = (m << 8) + t;
-  return (m << 4) + m + (m >> 1) - (m >> 3) - (m >> 6); // Multiply by 17.359375
+  return (m << 4) + (m << 1) - (m >> 1) - (m >> 3) - (m >> 6);
 #elif F_CPU == 1843200L
   // m needs to be multiplied by 8888.88
-  // and t by 34.72
+  // and t by 34.72 ~ 8888. / 256. for an error of 100 ppm
   m = (m << 8) + t;
-  return (m << 5) + (m << 1) + (m >> 1) + (m >> 2); // Multiply by 34.75
+  return (m << 5) + (m << 2) - m - (m >> 2) - (m >> 5);
 #else
   // 32 MHz, 24 MHz, 16 MHz, 8 MHz, 4 MHz, 1 MHz
-  // Shift by 8 to the left (multiply by 256) so t (which is 1 byte in size) can fit in 
+  // Shift by 8 to the left (multiply by 256) so t (which is 1 byte in size) can fit in
   // m & t are multiplied by 4 (since it was already multiplied by 256)
   // t is multiplied by 4
   return ((m << 8) + t) * (64 / clockCyclesPerMicrosecond());
 #endif
+#endif // 0
 }
 
 void delay(unsigned long ms)
 {
-  uint32_t start = micros();
+  unsigned long start = micros();
 
-  while (ms > 0) {
+  while (ms > 0UL) {
     yield();
-    while ( ms > 0 && (micros() - start) >= 1000) {
+    while (ms > 0UL && (micros() - start) >= 1000UL) {
       ms--;
-      start += 1000;
+      start += 1000UL;
     }
   }
 }
@@ -193,11 +415,15 @@ void delay(unsigned long ms)
  * In Arduino IDE 1.6.11 and newer LTO is enabled by default.  The LTO optimizes the code
  * at link time, making the code (often) significantly smaller without making it "slower"
  * and sometimes destroy acccurate software timings like delayMicroseconds() with lower values.
- * To avoid LTO optimization, the line of delayMicrosecons() definition in arduino.h must be replace to this:
+ * To avoid LTO optimization, the line of delayMicroseconds() definition in arduino.h must be replaced by this:
  * void delayMicroseconds(unsigned int) __attribute__ ((noinline)) ;
  */
 void delayMicroseconds(unsigned int us)
 {
+  // Note:
+  // We multiply `us' by as much as 6 below.  This reduces the available range of us.
+  // The README defines the safe calling range as 0 .. 10000 us.
+
   // call = 4 cycles + 1 to 4 cycles to init us(2 for constant delay, 4 for variable,
   //                                            1 for register variable)
 
@@ -206,6 +432,9 @@ void delayMicroseconds(unsigned int us)
   //delay_us(us);
 
 #if F_CPU >= 32000000L
+  // we catch this case so we don't underrun by subtraction
+  if (us == 0) return;           // 3 cycles (.1us) on false, which we ignore
+
   // the following loop takes a 1/4 of a microsecond (8 cycles with nops)
   // per iteration, so execute it four times for each microsecond of
   // delay requested.
@@ -224,8 +453,11 @@ void delayMicroseconds(unsigned int us)
 // # elif F_CPU >= 29491200L
 
 #elif F_CPU >= 25000000L
+  // we catch this case so we don't underrun by subtraction
+  if (us == 0) return;           // 3 cycles (.1us) on false, which we ignore
+
   // the following loop takes a 1/5 of a microsecond (5 cycles)
-  // per iteration, so execute it six times for each microsecond of
+  // per iteration, so execute it five times for each microsecond of
   // delay requested.
   us = (us << 2) + us; // x5 us, = 7 cycles
 
@@ -240,6 +472,9 @@ void delayMicroseconds(unsigned int us)
 #elif F_CPU >= 24000000L
   // for the 24 MHz external clock if someone is working with USB
 
+  // we catch this case so we don't underrun by subtraction
+  if (us == 0) return;           // 3 cycles (.1us) on false, which we ignore
+
   // the following loop takes a 1/6 of a microsecond (4 cycles)
   // per iteration, so execute it six times for each microsecond of
   // delay requested.
@@ -250,7 +485,38 @@ void delayMicroseconds(unsigned int us)
   // us is at least 6 so we can substract 5
   us -= 5; // = 2 cycles
 
-// #elif F_CPU >= 22118400L
+#elif F_CPU >= 22118400L
+  // this is basically the same as for 11.0592, except multiplying by 6, not 3.
+  // the correction factor is the same, but the multiply takes 4 cycles longer.
+
+  // the overhead of the function call is 14 (16) cycles which is ~2/3 us
+  if (us <= 1) return; // = 3 cycles, (4 when true)
+
+  us *= 6; // x6 us, = 9 cycles [{ us = (us<<2)+(us<<1); = 9 cycles too }]
+
+                       // +1 cycle (register save)
+  if (us > 16) // = 3 cycles
+  {
+    // since the loop is not accurately 1/6 of a microsecond we need
+    // to multiply us by 0.9216 (11.0592 / 12 = 22.1184 / 24)
+    us = (us * 60398UL) >> 16;  // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    // this drops us to at least 15
+
+    // account for the time taken in the preceding commands.
+    // we just burned 57 (59) cycles above, remove 14 (14*4=56),
+    // us is at least 15 so we may subtract 14 alright
+    us -= 14; // = 2 cycles
+  }
+  else
+  {
+    // account for the time taken in the preceding commands.
+    // we just burned 31 (33) cycles above, remove 8 (8*4=32),
+    // user wants to wait at least 2 us, after multiply us >= 12
+
+             // 1 cycle when if jump here
+    us -= 8; // 2 cycles
+             // 2 cycles to jump back to delay cycle.
+  }
 
 #elif F_CPU >= 20000000L
   __asm__ __volatile__ (
@@ -290,17 +556,18 @@ void delayMicroseconds(unsigned int us)
 
                        // +1 cycle (register save)
   // user wants to wait longer than 3 us
-  if (us > 15) // = 3 cycles
+  if (us > 17) // = 3 cycles
   {
     // Since the loop is not accurately 1/5 of a microsecond we need
     // to multiply us by 0.9216 (18.432 / 20)
-    us = (us * 60398L) >> 16;   // x0.9216 us = 29 cycles (60398 = 0.9216 * 0x10000L)
+    us = (us * 60398UL) >> 16;  // x0.9216 us = 29 cycles (60398 = 0.9216 * 0x10000L)
+    // this drops us to at least 16
 
     // account for the time taken in the preceeding commands.
     // we just burned 59 (61) cycles above, remove 15, (15*4=60)
     us -= 15; // = 2 cycles
   }
-  else 
+  else
   {
     // account for the time taken in the preceeding commands.
     // we just burned 33 (35) cycles above, remove 9, (9*4=36)
@@ -310,6 +577,42 @@ void delayMicroseconds(unsigned int us)
              // 2 cycles to jump back to delay cycle.
   }
 
+#elif F_CPU >= 18000000L
+  // for the 18 MHz clock, if somebody is working with USB
+  // or otherwise relating to 12 or 24 MHz clocks
+
+  // for a 1 microsecond delay, simply return.  the overhead
+  // of the function call takes 14 (16) cycles, which is .8 us
+  if (us <= 1) return; // = 3 cycles, (4 when true)
+
+  // make the loop below last 6 cycles
+#undef  _MORENOP_
+#define _MORENOP_ " nop \n\t  nop \n\t"
+
+  // the following loop takes 1/3 of a microsecond (6 cycles) per iteration,
+  // so execute it three times for each microsecond of delay requested.
+  us = (us << 1) + us; // x3 us, = 5 cycles
+
+  // account for the time taken in the preceeding commands.
+  // we just burned 20 (22) cycles above, remove 3 (3*6=18),
+  // us is at least 6 so we may subtract 3
+  us -= 3; // = 2 cycles
+
+#elif F_CPU >= 16500000L
+  // for a one-microsecond delay, simply return.  the overhead
+  // of the function call takes 14 (16) cycles, which is about 1us
+  if (us <= 1) return; //  = 3 cycles, (4 when true)
+
+  // the following loop takes 1/4 of a microsecond (4 cycles) times 32./33.
+  // per iteration, thus rescale us by 4. * 33. / 32. = 4.125 to compensate
+  us = (us << 2) + (us >> 3); // x4.125 with 22 cycles
+
+  // account for the time taken in the preceding commands.
+  // we burned 37 (39) cycles above, plus 2 below, remove 10 (4*10=40)
+  // us is at least 8, so we subtract only 7 to keep it positive
+  // the error is below one microsecond and not worth extra code
+  us -= 7; // = 2 cycles
+
 #elif F_CPU >= 16000000L
   // for a one-microsecond delay, simply return.  the overhead
   // of the function call takes 14 (16) cycles, which is 1 us
@@ -337,7 +640,8 @@ void delayMicroseconds(unsigned int us)
   {
     // Since the loop is not accurately 1/4 of a microsecond we need
     // to multiply us by 0.9216 (14.7456 / 16)
-    us = (us * 60398L) >> 16;   // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    us = (us * 60398UL) >> 16;  // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    // this drops us to at least 14
 
     // account for the time taken in the preceeding commands.
     // we just burned 53 (57) cycles above, remove 13, (13*4=52)
@@ -377,12 +681,13 @@ void delayMicroseconds(unsigned int us)
   us = (us << 1) + us; // x3 us, = 5 cycles
 
                        // +1 cycle (register save)
-  // user wants to wait longer than 4 us
-  if (us > 14) // = 3 cycles
+  // user wants to wait longer than 5 us
+  if (us > 15) // = 3 cycles
   {
     // since the loop is not accurately 1/3 of a microsecond we need
     // to multiply us by 0.9216 (11.0592 / 12)
-    us = (us * 60398L) >> 16;   // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    us = (us * 60398UL) >> 16;  // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    // this drops us to at least 14
 
     // account for the time taken in the preceeding commands.
     // we just burned 53 (55) cycles above, remove 13, (13*4=52)
@@ -391,7 +696,8 @@ void delayMicroseconds(unsigned int us)
   else
   {
     // account for the time taken in the preceeding commands.
-    // we just burned 27 (29) cycles above, remove 7, (7*4=28)
+    // we just burned 27 (29) cycles above, remove 7 (7*4=28),
+    // us is at least 9, so we may subtract without underflow
 
              // 1 cycle when if jump here
     us -= 7; // 2 cycles
@@ -406,41 +712,47 @@ void delayMicroseconds(unsigned int us)
   if (us <= 2) return; // = 3 cycles, (4 when true)
 
   // the following loop takes 2/5 of a microsecond (4 cycles)
-  // per iteration, so execute it three times for each microsecond of
-  // delay requested.
+  // per iteration, so execute it five times for every 2 microseconds
+  // of delay requested.
   us = (us << 1) + (us >> 1); // x2.5 us, = 7 cycles
 
   // account for the time taken in the preceeding commands.
-  // we just burned 22 (24) cycles above, remove 5, (5*4=20)
-  // us is at least 20 so we can substract 5
-  us -= 5; // = 2 cycles
+  // we burn 22 (24) cycles above plus 2 below, remove 6, (6*4=24)
+  // us is at least 7 so we can subtract 6
+  us -= 6; // = 2 cycles
 
 #elif F_CPU >= 9216000L
   // the overhead of the function call is 14 (16) cycles which is ~1.5 us
-  if (us <= 3) return; // = 3 cycles, (4 when true)
+  if (us <= 2) return; // = 3 cycles, (4 when true)
 
-  us = (us << 2) + us ; // x2.5x2 us, = 7 cycles
+  // get a factor of 10 by multiplying by 2 and making the loop last 5 cycles
+  us <<= 1; // x2 us, = 2 cycles
+
+  // make the delay loop last 5 cycles
+#undef  _MORENOP_
+#define _MORENOP_ " nop \n\t"
 
                        // +1 cycle (register save)
-  // user wants to wait longer than 6 us
-  if (us > 30) // = 3 cycles
+  // user wants to wait longer than 5 us
+  if (us > 11) // = 3 cycles
   {
-    // since the loop is not accurately 2/5 of a microsecond we need
+    // since the loop is not accurately 1/2 of a microsecond we need
     // to multiply us by 0.9216 (11.0592 / 12)
-    us = (us * 30199L) >> 16;   // x(0.9216/2) us = 29 cycles (30199 = 0.4608 x 0x10000L)
+    us = (us * 60398UL) >> 16;   // x(0.9216) us = 29 cycles
+    // this drops us to at least 11
 
     // account for the time taken in the preceeding commands.
-    // we just burned 53 (55) cycles above, remove 13, (13*4=52)
-    us -= 13; // = 2 cycles
+    // we just burned 48 (50) cycles above, remove 10 (10*5=50)
+    us -= 10; // = 2 cycles
   }
   else
   {
     // account for the time taken in the preceeding commands.
-    // we just burned 31 (33) cycles above, remove 8, (8*4=32)
+    // we just burned 26 (28) cycles above, remove 5 (5*5=25)
+    // us is at least 6 so we may subtract 5
 
               // 1 cycle when if jump here
-    us >>= 1; // 2 cycles restore x2.5 us
-    us -=  8; // 2 cycles
+    us -= 5;  // 2 cycles
               // 2 cycles to jump back to delay cycle.
   }
 
@@ -479,7 +791,8 @@ void delayMicroseconds(unsigned int us)
   {
     // since the loop is not accurately 1/2 of a microsecond we need
     // to multiply us by 0.9216 (7.3728 / 8)
-    us = (us * 60398L) >> 16;   // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    us = (us * 60398UL) >> 16;  // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    // this drops us to at least 14
 
     // account for the time taken in the preceeding commands.
     // we just burned 52 (54) cycles above, remove 13, (13*4=52)
@@ -496,6 +809,20 @@ void delayMicroseconds(unsigned int us)
              // 2 cycles to jump back to delay cycle.
   }
 
+#elif F_CPU >= 6000000L
+  // for a 1 to 3 microsecond delay, simply return.  the overhead
+  // of the function call takes 14 (16) cycles, which is 2.5us
+  if (us <= 3) return; //  = 3 cycles, (4 when true)
+
+  // make the loop below last 6 cycles
+#undef  _MORENOP_
+#define _MORENOP_ " nop \n\t  nop \n\t"
+
+  // the following loop takes 1 microsecond (6 cycles) per iteration
+  // we burned 15 (17) cycles above, plus 2 below, remove 3 (3 * 6 = 18)
+  // us is at least 4 so we can subtract 3
+  us -= 3; // = 2 cycles
+
 #elif F_CPU >= 4000000L
   __asm__ __volatile__ ("nop"); // just waiting 1 cycle
   // the overhead of the function call is 15 (17) cycles which is 4 us
@@ -511,13 +838,16 @@ void delayMicroseconds(unsigned int us)
   // of the function call takes 14 (16) cycles, which is almost 4 us
   if (us <= 6) return; // = 3 cycles, (4 when true)
 
+                       // Question:
+                       // Are we certain that there is a register save?
                        // +1 cycle (register save)
-  // user wants to wait longer than 12 us
-  if (us > 12) // = 3 cycles
+  // user wants to wait longer than 14 us
+  if (us > 14) // = 3 cycles
   {
     // since the loop is not accurately 1 microsecond we need
     // to multiply us by 0.9216 ( = 3.6864 / 4)
-    us = (us * 60398L) >> 16;   // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    us = (us * 60398UL) >> 16;  // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L)
+    // this drops us to at least 13
 
     // account for the time taken in the preceeding commands.
     // we just burned 47 (49) cycles above, remove 12, (12*4=48)
@@ -551,15 +881,16 @@ void delayMicroseconds(unsigned int us)
 #elif F_CPU >= 1843200L
   // for less than 13 microsecond delay, simply return. the overhead
   // of the function call takes 14 (16) cycles, which is almost 8 us
-  if (us <= 12) return; // = 3 cycles, (4 when true)
+  if (us <= 13) return; // = 3 cycles, (4 when true)
 
                         // no register save here
-  // user wants to wait longer than 25 us
-  if (us > 25) // = 3 cycles
+  // user wants to wait longer than 54 us
+  if (us > 54) // = 3 cycles
   {
     // since the loop takes ~2.17 microseconds we need
     // to multiply us by 0.4608 ( = 1.8432 / 2 / 2 )
     us = (us * 30199L) >> 16;   // x(0.9216/2) us = 29 cycles (30199 = 0.4608 x 0x10000L)
+    // this drops us to at least 25
 
     // account for the time taken in the preceeding commands.
     // we just burned 47 (49) cycles above, remove 24, microseconds
@@ -572,6 +903,7 @@ void delayMicroseconds(unsigned int us)
 
               // 1 cycle when if jump here
     us -= 12; // 2 cycles
+    // this drops us to at least 2 and we divide by 2 below
 
     us >>= 1; // division by 2 = 2 cycles
               // 2 cycles to jump back to delay cycle.
@@ -591,13 +923,14 @@ void delayMicroseconds(unsigned int us)
   // us is at least 4, divided by 4 gives us 1 (no zero delay bug)
   us >>= 2; // us div 4, = 4 cycles
 
-
 #endif
 
   // busy wait
   __asm__ __volatile__ (
     "1: sbiw %0,1" "\n\t"            // 2 cycles
-        _MORENOP_                    // 4 cycles if 32 MHz or 1 cycle if 25 MHz
+        _MORENOP_                    // 4 cycles if 32 MHz or
+                                     // 1 cycle  if 25, 9.216
+                                     // 2 cycles if 18, 6 MHz
     "   brne 1b"                     // 2 cycles
     : /* no outputs */
     : "w" (us)
@@ -698,9 +1031,9 @@ void init()
 
 #if defined(TCCR4A) && defined(TCCR4B) && defined(TCCR4D)
   TCCR4B |= _BV(CS42) | _BV(CS41) | _BV(CS40); // Set timer 4 prescale factor to 64
-  TCCR4D |= _BV(WGM40);                        // Put timer 4 in phase- and frequency-correct PWM mode 
+  TCCR4D |= _BV(WGM40);                        // Put timer 4 in phase- and frequency-correct PWM mode
   TCCR4A |= _BV(PWM4A);                        // Enable PWM mode for comparator OCR4A
-  TCCR4C |= _BV(PWM4D);                        // Enable PWM mode for comparator OCR4D 
+  TCCR4C |= _BV(PWM4D);                        // Enable PWM mode for comparator OCR4D
 #elif defined(TCCR4B) && defined(CS41) && defined(WGM40)
   TCCR4B |= _BV(CS41) | _BV(CS40); // Set timer 4 prescale factor to 64
   TCCR4A |= _BV(WGM40);            // Put timer 4 in 8-bit phase correct pwm mode
diff --git a/wiring_analog.c b/wiring_analog.c
index 3e609533b..640100316 100755
--- a/wiring_analog.c
+++ b/wiring_analog.c
@@ -77,7 +77,7 @@ int analogRead(uint8_t pin)
   ADCSRA |= _BV(ADSC);
 
   // ADSC is cleared when the conversion finishes
-  while (bit_is_set(ADCSRA, ADSC));
+  while (ADCSRA & _BV(ADSC));
 
   // we have to read ADCL first; doing so locks both ADCL
   // and ADCH until ADCH is read.  reading ADCL second would