diff --git a/avr/cores/MCUdude_corefiles/Arduino.h b/avr/cores/MCUdude_corefiles/Arduino.h index 78eeb8a2..fe006267 100755 --- a/avr/cores/MCUdude_corefiles/Arduino.h +++ b/avr/cores/MCUdude_corefiles/Arduino.h @@ -121,11 +121,12 @@ void yield(void); #undef abs #endif -#define abs(x) __builtin_abs(x) +// abs() copies its argument into _x so it is evaluated once; both branches must use _x, never the raw x. +#define abs(x) ({ typeof (x) _x = (x); _x > 0 ? _x : -_x; }) #define sq(x) ({ typeof (x) _x = (x); _x * _x; }) -#define min(a,b) ({ typeof (a) _a = (a); typeof (b) _b = (b); _a < _b ? _a : _b; }) -#define max(a,b) ({ typeof (a) _a = (a); typeof (b) _b = (b); _a > _b ? _a : _b; }) -#define round(x) ({ typeof (x) _x = (x); _x >= 0 ? (long)_x + 0.5 : (long)_x - 0.5; }) +#define min(a,b) ({ typeof (a) _a = (a); typeof (b) _b = (b); _a < _b ? _a : _b; }) +#define max(a,b) ({ typeof (a) _a = (a); typeof (b) _b = (b); _a > _b ? _a : _b; }) +#define round(x) ({ typeof (x) _x = (x); _x >= 0 ? (long)(_x + 0.5) : (long)(_x - 0.5); }) #define radians(deg) ((deg) * DEG_TO_RAD) #define degrees(rad) ((rad) * RAD_TO_DEG) #define constrain(x,low,high) ({ \ diff --git a/avr/cores/MCUdude_corefiles/README.md b/avr/cores/MCUdude_corefiles/README.md index 71e81674..8c68f6da 100644 --- a/avr/cores/MCUdude_corefiles/README.md +++ b/avr/cores/MCUdude_corefiles/README.md @@ -3,7 +3,9 @@ This repo contains the Arduino corefiles used with [MightyCore](https://github.com/MCUdude/MightyCore), [MegaCore](https://github.com/MCUdude/MegaCore), [MiniCore](https://github.com/MCUdude/MiniCore) and [MajorCore](https://github.com/MCUdude/MightyCore). 
+ ## Supported devices + * ATmega640, ATmega1280, ATmega2560 * ATmega64, ATmega128, ATmega1281, ATmega2561 * AT90CAN32, AT90CAN64, AT90CAN128 @@ -11,20 +13,92 @@ This repo contains the Arduino corefiles used with [MightyCore](https://github.c * ATmega8515, ATmega162 * ATmega8, ATmega48/P/PA/PB, ATmega88/P/PA/PB, ATmega168/P/PA/PB, ATmega328/P/PA/PB + ## Supported clock frequencies -By supported I mean clocks that accurate timing is implemented for (millis, micros, delay, delayMicroseconds). + +By supported I mean clocks for which accurate timing is implemented (millis, +micros, delay, delayMicroseconds). + * 32 MHz +* 25 MHz * 24 MHz +* 22.1184 MHz * 20 MHz * 18.432 MHz +* 18 MHz +* 16.5 MHz * 16 MHz * 14.7456 MHz * 12 MHz * 11.0592 MHz +* 10 MHz +* 9.216 MHz * 8 MHz * 7.3728 MHz +* 6 MHz * 4 MHz * 3.6864 MHz * 2 MHz * 1.8432 MHz * 1 MHz + + +### Adding further clock frequencies + +The calculation of `millis()`, `micros()` and `delay()` is automatic for +arbitrary frequencies. +Depending on the prime factors of the frequency, it is either exact or +accurate to within 60 ppm (worst case). +The only thing required is adding support in `delayMicroseconds()`. + + +### Exactness of `delayMicroseconds()` + +The `delayMicroseconds(unsigned int us)` implementation is exact up to a few +cycles for the frequencies listed above. + +The largest argument that works reliably is 10000 (i.e. 10 milliseconds). +Interrupts that occur during the delay may prolong it. + + +### Exactness of `micros()` and `delay()` + +For the clock speeds listed above, `micros()` is corrected to zero drift. +Even for very long run times, the `micros()` function will precisely follow the +oscillator used. + +Frequencies not listed above are either exact or corrected to below 60 ppm drift +and in exact sync with `millis()`. + +Note that the result of `micros()` may jump up by several microseconds between +consecutive calls and rolls over after one hour and eleven minutes. 
+ +The `delay()` function uses `micros()` internally and inherits its drift accuracy +with slight variations due to function call overhead and processing. +It is immune to interrupts and thus long-term accurate. + + +### Exactness of `millis()` + +For the clock speeds listed above, `millis()` is corrected to zero drift. +Even for very long run times, the `millis()` function will precisely follow the +oscillator used. + +Frequencies not listed above are either exact or corrected to below 60 ppm drift +and in exact sync with `micros()` and `delay()`. + +We do not register the rollover of the `unsigned long` millis counter that +occurs every 49.7 days; such would have to be done in the user's program. +Often this is not necessary: The code + + if (millis() - millis_old >= interval) { + /* do something */ + millis_old += interval; + } + +is long-term accurate even when rolling over provided `millis_old` is of type +`unsigned long`. + +For clock speeds of 16 MHz and below, the return value of `millis()` +occasionally jumps up by more than one (notwithstanding low/zero drift). +Thus, when relying on consecutive returns, run at 16.5 MHz or higher. diff --git a/avr/cores/MCUdude_corefiles/wiring.c b/avr/cores/MCUdude_corefiles/wiring.c index 6b0df7f7..9d2bf2ec 100755 --- a/avr/cores/MCUdude_corefiles/wiring.c +++ b/avr/cores/MCUdude_corefiles/wiring.c @@ -24,29 +24,131 @@ // the prescaler is set so that timer0 ticks every 64 clock cycles, and the // the overflow handler is called every 256 ticks. 
-// 24MHz: An overflow happens every 682.67 microseconds ---> 0.04167, so this results in 682 +// 24MHz: An overflow happens every 682.67 microseconds ---> 0.04167, so this results in 682 // 20MHz: An overflow happens every 819.2 microseconds ---> 0,05 (time of a cycle in micros) * 64 (timer0 tick) * 256 (every 256 ticks timer0 overflows), so this results in 819 // 16MHz: An overflow happens every 1024 microseconds +#if 0 +// this would be inaccurate for non-power-of-two frequencies #define MICROSECONDS_PER_TIMER0_OVERFLOW (clockCyclesToMicroseconds(64 * 256)) +#else +// It is vital to avoid unnecessary roundoff in this calculation. +// What we really want to compute is the number of microseconds in one +// timer cycle, thus 64 * 256 * 1e6 / F_CPU. When calculating with integers, +// the product 64 * 256 * 1000**2 overflows an unsigned long. We resolve this +// by recognizing that F_CPU is evenly divisible by 10 in all cases. Thus, we +// cancel a factor of 10 on both sides, which allows us to use unsigned long. +// It turns out that code runs faster when the number is explicitly unsigned! +#define MICROSECONDS_PER_TIMER0_OVERFLOW \ + (64UL * 256UL * 100000UL / ((F_CPU + 5UL) / 10UL)) +#endif // the whole number of milliseconds per timer0 overflow // For 20MHz this would be 0 (because of 819) // For 16MHz this would be 1 (because of 1024) -#define MILLIS_INC (MICROSECONDS_PER_TIMER0_OVERFLOW / 1000) +#define MILLIS_INC (MICROSECONDS_PER_TIMER0_OVERFLOW / 1000U) // the fractional number of milliseconds per timer0 overflow. we shift right // by three to fit these numbers into a byte. (for the clock speeds we care // about - 8 and 16 MHz - this doesn't lose precision.) 
// For 16 MHz: 24 (1024 % 1000) gets shifted right by 3 which results in 3 (precision was lost) // For 20 MHz: 819 (819 % 1000) gets shifted right by 3 which results in 102 (precision was lost) -// For 24 MHz: 682 (682 % 1000) gets shifted right by 3 which results in -#define FRACT_INC ((MICROSECONDS_PER_TIMER0_OVERFLOW % 1000) >> 3) +// For 24 MHz: 682 (682 % 1000) gets shifted right by 3 which results in +#define FRACT_INC ((MICROSECONDS_PER_TIMER0_OVERFLOW % 1000U) >> 3) // Shift right by 3 to fit in a byte (results in 125) -#define FRACT_MAX (1000 >> 3) +#define FRACT_MAX (1000U >> 3) -volatile unsigned long timer0_overflow_count = 0; volatile unsigned long timer0_millis = 0; -static unsigned char timer0_fract = 0; +volatile unsigned char timer0_fract = 0; + +// Add a correction calculation to make millis () exact for most clocks. +// The idea is to compare the exact microseconds/8 between overflows, +// namely (1. / F_CPU * 64. * 256. * 1e6) % 1000 / 8., +// with the integer rounded down version in FRACT_INC. +// For the clock speeds examined below, we encounter four different cases. +// The low case: FRACT_INC is too low by a fraction 1 / n. +// Correct by adding 1 to the fract counter every n times. +// The high case: FRACT_INC is too low by a fraction (n - 1) / n. +// Add 1 to the fract counter always except every n times. +// A special case for 20 MHz: FRACT_INC is too low by the fraction 2. / 5. +// Correct by adding 2 out of 5 times: every odd number in 0..4. +// A special case for 11.0592 MHz: FRACT_INC is too low by 5. / 27. +// Correct brute force by counting 5 out of 27. +// Do it the same way for the remaining odd cases. +// This way we correct losses from both the rounding to usecs and the shift. +// For the remaining non-exact cases, we use a highly accurate approximation. +// This happens to be exact, too, for leftover UART-related frequencies. 
+#define FRACT_INC_PLUS +#define EXACT_NUM (64UL * 256UL * 125UL * 100UL) +#define EXACT_DEN ((F_CPU + 5UL) / 10UL) +#define EXACT_REM (EXACT_NUM - (EXACT_NUM / EXACT_DEN) * EXACT_DEN) +#if EXACT_REM > 0 || MICROSECONDS_PER_TIMER0_OVERFLOW % 256 > 0 // correct +#define CORRECT_EXACT_MILLIS +#define CORRECT_EXACT_MICROS +#if F_CPU == 25000000L // for 25 MHz we get 81.92, off by 23./25. +#define CORRECT_BRUTE 23 +#define CORRECT_ROLL 25 +#elif F_CPU == 24000000L // for 24 MHz we get 85.33, off by 1./3. +#define CORRECT_LO +#define CORRECT_ROLL 3 +#elif F_CPU == 22118400L // for 22.1184 MHz we get 92 + 16./27. +#define CORRECT_BRUTE 16 +#define CORRECT_ROLL 27 +#elif F_CPU == 20000000L // for 20 MHz we get 102.4, off by 2./5. +#define CORRECT_ODD +#define CORRECT_ROLL 5 +#elif F_CPU == 18432000L // for 18.432 MHz we get 111.11, off by 1./9. +#define CORRECT_LO +#define CORRECT_ROLL 9 +#elif F_CPU == 18000000L // for 18 MHz we get 113.78, off by 7./9. +#define CORRECT_BRUTE 7 +#define CORRECT_ROLL 9 +#elif F_CPU == 16500000L // for 16.5 MHz we get 124 + 4./33. +#define CORRECT_BRUTE 4 +#define CORRECT_ROLL 33 +#elif F_CPU == 14745600L // for 14.7456 MHz we get 13.89, off by 8./9. +#define CORRECT_HI +#define CORRECT_ROLL 9 +#elif F_CPU == 12000000L // for 12 MHz we get 45.67, off by 2./3. +#define CORRECT_HI +#define CORRECT_ROLL 3 +#elif F_CPU == 11059200L // for 11.0592 MHz we get 60 + 5./27. +#define CORRECT_BRUTE 5 +#define CORRECT_ROLL 27 +#elif F_CPU == 10000000L // for 10 MHz we get 79.8, off by 4./5. +#define CORRECT_HI +#define CORRECT_ROLL 5 +#elif F_CPU == 9216000L // for 9.216 MHz we get 97. + 2./9. +#define CORRECT_BRUTE 2 +#define CORRECT_ROLL 9 +#elif F_CPU == 7372800L // for 7.3728 MHz we get 27 + 7./9. +#define CORRECT_BRUTE 7 +#define CORRECT_ROLL 9 +#elif F_CPU == 6000000L // for 6 MHz we get 91 + 1./3. +#define CORRECT_LO +#define CORRECT_ROLL 3 +#elif F_CPU == 3686400L // for 3.6864 MHz we get 55 + 5./9. 
+#define CORRECT_BRUTE 5 +#define CORRECT_ROLL 9 +#elif F_CPU == 1843200L // for 1.8432 MHz we get 111.11, off by 1./9. +#define CORRECT_LO +#define CORRECT_ROLL 9 +#else // fallback accurate to better than 60 ppm +#define CORRECT_BRUTE ((2U * 135U * EXACT_REM + EXACT_DEN) / (2U * EXACT_DEN)) +#define CORRECT_ROLL 135 +#if CORRECT_BRUTE <= 0 +#undef CORRECT_EXACT_MILLIS // low corner case amounts to nothing +#elif CORRECT_BRUTE >= CORRECT_ROLL +#undef CORRECT_EXACT_MILLIS +#undef FRACT_INC_PLUS +#define FRACT_INC_PLUS + 1 // high corner case always adds one extra +#endif +#endif // fallback +#endif // EXACT_REM > 0 + +#ifndef CORRECT_EXACT_MICROS +// variable is only needed in micros() calculation without exactness correction +volatile unsigned long timer0_overflow_count = 0; +#endif // timer0 interrupt routine ,- is called every time timer0 overflows #if defined(__AVR_ATtiny24__) || defined(__AVR_ATtiny44__) || defined(__AVR_ATtiny84__) @@ -55,21 +157,59 @@ ISR(TIM0_OVF_vect) ISR(TIMER0_OVF_vect) #endif { +#ifdef CORRECT_EXACT_MILLIS + // this is a variable that retains its value between calls + static unsigned char timer0_exact = 0; +#endif + // copy these to local variables so they can be stored in registers // (volatile variables must be read from memory on every access, so this saves time) unsigned long m = timer0_millis; unsigned char f = timer0_fract; - m += MILLIS_INC; - f += FRACT_INC; + f += FRACT_INC FRACT_INC_PLUS; + +#ifdef CORRECT_EXACT_MILLIS + // correct millis () to be exact for certain clocks + if (timer0_exact == CORRECT_ROLL - 1) { + timer0_exact = 0; +#ifdef CORRECT_LO + ++f; +#endif + } + else { + ++timer0_exact; +#ifdef CORRECT_HI + ++f; +#endif + } + // it does not matter for the long-time drift whether the following two + // corrections take place before or after the increment of timer0_exact +#ifdef CORRECT_ODD + if (timer0_exact & 1) { + ++f; + } +#endif +#ifdef CORRECT_BRUTE + if (timer0_exact < CORRECT_BRUTE) { + ++f; + } +#endif 
+#endif // CORRECT_EXACT_MILLIS + if (f >= FRACT_MAX) { f -= FRACT_MAX; - m += 1; + m += MILLIS_INC + 1; + } + else { + m += MILLIS_INC; } timer0_fract = f; timer0_millis = m; +#ifndef CORRECT_EXACT_MICROS timer0_overflow_count++; +#endif } unsigned long millis() @@ -88,13 +228,24 @@ unsigned long millis() unsigned long micros() { unsigned long m; - uint8_t oldSREG = SREG; +#ifdef CORRECT_EXACT_MICROS + unsigned char f; // temporary storage for millis fraction counter + unsigned char q = 0; // record whether an overflow is flagged +#endif // t will be the number where the timer0 counter stopped uint8_t t; + uint8_t oldSREG = SREG; // Stop all interrupts cli(); + +#ifdef CORRECT_EXACT_MICROS + // combine exact millisec and 8usec counters + m = timer0_millis; + f = timer0_fract; +#else m = timer0_overflow_count; +#endif // TCNT0 : The Timer Counter Register #if defined(TCNT0) @@ -108,77 +259,148 @@ unsigned long micros() { // Timer0 Interrupt Flag Register #ifdef TIFR0 if ((TIFR0 & _BV(TOV0)) && (t < 255)) +#ifndef CORRECT_EXACT_MICROS m++; +#else + q = 1; +#endif #else if ((TIFR & _BV(TOV0)) && (t < 255)) +#ifndef CORRECT_EXACT_MICROS m++; +#else + q = 1; +#endif #endif // Restore SREG SREG = oldSREG; -#if F_CPU >= 24000000L && F_CPU < 32000000L +#ifdef CORRECT_EXACT_MICROS + /* We convert milliseconds, fractional part and timer value + into a microsecond value. Relies on CORRECT_EXACT_MILLIS. + Basically we multiply by 1000 and add the scaled timer. + + The leading part by m and f is long-term accurate. + For the timer we just need to be close from below. + Must never be too high, or micros jumps backwards. */ + m = (((m << 7) - (m << 1) - m + f) << 3) + + ((t * MICROSECONDS_PER_TIMER0_OVERFLOW) >> 8); + return q ? m + MICROSECONDS_PER_TIMER0_OVERFLOW : m; +#elif 1 + /* All power-of-two Megahertz frequencies enter here, as well as 12.8 MHz. + We only end up here if right shift before multiplication is exact. 
*/ + return ((m << 8) + t) * (MICROSECONDS_PER_TIMER0_OVERFLOW >> 8); +#else +/* + * This is the old code requiring individual treatment for each frequency. + * It has the following accuracy for non-power-of-two MHz frequencies. + * + * 20 MHz has a drift of 1 in 65536 (~15 ppm) + * 18.432 Mhz has a drift of 1 in 64000 (~16 ppm) + * 25 MHz has a drift of 1 in 43691 (~23 ppm) + * 14.7456 MHz has a drift of 1 in 10000 (100 ppm) + * 7.3728 MHz has a drift of 1 in 10000 + * 3.6864 MHz has a drift of 1 in 10000 + * 1.8432 MHz has a drift of 1 in 10000 + * 24 MHz has a drift of 1 in 4096 (244 ppm) + * 18 MHz has a drift of 1 in 4096 + * 12 MHz has a drift of 1 in 4096 + * 22.1184 MHz has a drift of 1 in 2857 (350ppm) + * 11.0592 MHz has a drift of 1 in 2857 +*/ +#if F_CPU >= 32000000L + // we need to put this #if here to avoid entering the wrong branch for 32 MHz + return ((m << 8) + t) * (64 / clockCyclesPerMicrosecond()); +#elif F_CPU >= 25000000L + // m needs to be multiplied by 655.36 + // and t by 2.56 ~ 5243 / 2048. for an error of 1 in 43691 (23 ppm) + m = (m << 8) + t; + // How many shift adds does it take until long multiply becomes faster? + // Can we just return (m * 41943UL) >> 14 and be done to 1ppm accuracy. + return (m << 2) - m - (m >> 1) + (m >> 4) - (m >> 9) - (m >> 11); +#elif F_CPU >= 24000000L // m needs to be multiplied by 682.67 - // and t by 2.67 + // and t by 2.667 ~ 1365 / 512. for an error of 1 in 4096 (244 ppm) + m = (m << 8) + t; + m = (m << 1) + (m >> 1) + (m >> 3); + return m + (m >> 6); +#elif F_CPU >= 22118400L + // m needs to be multiplied by 740.74 + // and t by 2.894 ~ 741 / 256. for an error of 1 in 2857 (350 ppm) m = (m << 8) + t; - return (m << 1) + (m >> 1) + (m >> 3) + (m >> 4); // Multiply by 2.6875 + return m + (m << 1) - (m >> 3) + (m >> 6) + (m >> 8); #elif F_CPU >= 20000000L - // m needs to be multiplied by 819.2 - // t needs to be multiplied by 3.2 + // m needs to be multiplied by 819.2 + // and t by 16. / 5. 
= 3.2 ~ 819 / 256. for an error of 1 in 4096 m = (m << 8) + t; - return m + (m << 1) + (m >> 2) - (m >> 4); // Multiply by 3.1875 + m = (m << 2) - m; + // return m + (m >> 4) + (m >> 8); + // improve further to 3.19995 ~ 13107 / 4096. for an error of 15 ppm + m += (m >> 4); + return m + (m >> 8); #elif F_CPU >= 18432000L - // m needs to be multiplied by 888.88 - // and t by 3.47 + // m needs to be multiplied by 888.89 + // and t by 125. / 36. ~ 3.472 ~ 889. / 256. for an error of 1 in 8000 m = (m << 8) + t; - return m + (m << 1) + (m >> 1); // Multiply by 3.5 + // return (m << 2) - (m >> 1) - (m >> 5) + (m >> 8); + // improve further to 3.47217 ~ 7111. / 2048. for an error of 16 ppm + return (m << 2) - (m >> 1) - (m >> 5) + (m >> 8) - (m >> 11); +#elif F_CPU >= 18000000L + // m needs to be multiplied by 910.22 + // and t by 3.556 ~ 910. / 256. for an error of 1 in 4096 + m = (m << 8) + t; + m = (m << 2) - (m >> 1); + return m + (m >> 6); #elif F_CPU >= 14745600L && F_CPU != 16000000L - // m needs to be multiplied by 1111.1 - // and t by 4.34 + // m needs to be multiplied by 1111.11 + // and t by 4.34 ~ 1111. / 256. for an error of 100 ppm m = (m << 8) + t; - return (m << 2) + (m >> 1) - (m >> 3) - (m >> 4); // Multiply by 4.3125 + return (m << 2) + (m >> 1) - (m >> 3) - (m >> 5) - (m >> 8); #elif F_CPU >= 12000000L && F_CPU != 16000000L // m needs to be multiplied by 1365.33 - // and t by 5.33 + // and t by 5.33 ~ 1365. / 256. for an error of 1 in 4096 m = (m << 8) + t; - return m + (m << 2) + (m >> 2) + (m >> 3) - (m >> 4) + (m >> 5); // Multiply by 5.3437 + m += (m << 2) + (m >> 2); + return m + (m >> 6); #elif F_CPU >= 11059200L && F_CPU != 16000000L // m needs to be multiplied by 1481.48 - // and t by 5.78 + // and t by 5.789 ~ 1482. / 256. 
for an error of 1 in 2857 m = (m << 8) + t; - return (m << 2) + (m << 1) - (m >> 2) + (m >> 5); // Multiply by 5.78125 + return (m << 3) - (m << 1) - (m >> 2) + (m >> 5) + (m >> 7); #elif F_CPU == 7372800L // m needs to be multiplied by 2222.22 - // and t by 8.68 + // and t by 8.68 ~ 2222. / 256. for an error of 100 ppm m = (m << 8) + t; - return (m << 3) + m - (m >> 2) - (m >> 3); // Multiply by 8.625 + return (m << 3) + m - (m >> 2) - (m >> 4) - (m >> 7); #elif F_CPU == 3686400L // m needs to be multiplied by 4444.44 - // and t by 17.36 + // and t by 17.36 ~ 4444. / 256. for an error of 100 ppm m = (m << 8) + t; - return (m << 4) + m + (m >> 1) - (m >> 3) - (m >> 6); // Multiply by 17.359375 + return (m << 4) + (m << 1) - (m >> 1) - (m >> 3) - (m >> 6); #elif F_CPU == 1843200L // m needs to be multiplied by 8888.88 - // and t by 34.72 + // and t by 34.72 ~ 8888. / 256. for an error of 100 ppm m = (m << 8) + t; - return (m << 5) + (m << 1) + (m >> 1) + (m >> 2); // Multiply by 34.75 + return (m << 5) + (m << 2) - m - (m >> 2) - (m >> 5); #else // 32 MHz, 24 MHz, 16 MHz, 8 MHz, 4 MHz, 1 MHz - // Shift by 8 to the left (multiply by 256) so t (which is 1 byte in size) can fit in + // Shift by 8 to the left (multiply by 256) so t (which is 1 byte in size) can fit in // m & t are multiplied by 4 (since it was already multiplied by 256) // t is multiplied by 4 return ((m << 8) + t) * (64 / clockCyclesPerMicrosecond()); #endif +#endif // 0 } void delay(unsigned long ms) { - uint32_t start = micros(); + unsigned long start = micros(); - while (ms > 0) { + while (ms > 0UL) { yield(); - while ( ms > 0 && (micros() - start) >= 1000) { + while (ms > 0UL && (micros() - start) >= 1000UL) { ms--; - start += 1000; + start += 1000UL; } } } @@ -193,11 +415,15 @@ void delay(unsigned long ms) * In Arduino IDE 1.6.11 and newer LTO is enabled by default. 
The LTO optimizes the code * at link time, making the code (often) significantly smaller without making it "slower" * and sometimes destroy acccurate software timings like delayMicroseconds() with lower values. - * To avoid LTO optimization, the line of delayMicrosecons() definition in arduino.h must be replace to this: + * To avoid LTO optimization, the line of delayMicroseconds() definition in arduino.h must be replaced by this: * void delayMicroseconds(unsigned int) __attribute__ ((noinline)) ; */ void delayMicroseconds(unsigned int us) { + // Question: + // We multiply `us' by as much as 6 below. This reduces the available range of us. + // Updated README to define the safe calling range to 0 .. 10000 us. + // call = 4 cycles + 1 to 4 cycles to init us(2 for constant delay, 4 for variable, // 1 for register variable) @@ -206,6 +432,9 @@ void delayMicroseconds(unsigned int us) //delay_us(us); #if F_CPU >= 32000000L + // we catch this case so we don't underrun by subtraction + if (us == 0) return; // 3 cycles (.1us) on false, which we ignore + // the following loop takes a 1/4 of a microsecond (8 cycles with nops) // per iteration, so execute it four times for each microsecond of // delay requested. @@ -224,8 +453,11 @@ void delayMicroseconds(unsigned int us) // # elif F_CPU >= 29491200L #elif F_CPU >= 25000000L + // we catch this case so we don't underrun by subtraction + if (us == 0) return; // 3 cycles (.1us) on false, which we ignore + // the following loop takes a 1/5 of a microsecond (5 cycles) - // per iteration, so execute it six times for each microsecond of + // per iteration, so execute it five times for each microsecond of // delay requested. 
us = (us << 2) + us; // x5 us, = 7 cycles @@ -240,6 +472,9 @@ void delayMicroseconds(unsigned int us) #elif F_CPU >= 24000000L // for the 24 MHz external clock if someone is working with USB + // we catch this case so we don't underrun by subtraction + if (us == 0) return; // 3 cycles (.1us) on false, which we ignore + // the following loop takes a 1/6 of a microsecond (4 cycles) // per iteration, so execute it six times for each microsecond of // delay requested. @@ -250,7 +485,38 @@ void delayMicroseconds(unsigned int us) // us is at least 6 so we can substract 5 us -= 5; // = 2 cycles -// #elif F_CPU >= 22118400L +#elif F_CPU >= 22118400L + // this is basically the same as for 11.0592, except multiplying by 6, not 3. + // the correction factor is the same, but the multiply takes 4 cycles longer. + + // the overhead of the function call is 14 (16) cycles which is ~2/3 us + if (us <= 1) return; // = 3 cycles, (4 when true) + + us *= 6; // x6 us, = 9 cycles [{ us = (us<<2)+(us<<1); = 9 cycles too }] + + // +1 cycle (register save) + if (us > 16) // = 3 cycles + { + // since the loop is not accurately 1/6 of a microsecond we need + // to multiply us by 0.9216 (11.0592 / 12 = 22.1184 / 24) + us = (us * 60398UL) >> 16; // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L) + // this drops us to at least 15 + + // account for the time taken in the preceeding commands. + // we just burned 57 (59) cycles above, remove 14 (14*4=56), + // us is at least 15 so we may subtract 14 alright + us -= 14; // = 2 cycles + } + else + { + // account for the time taken in the preceeding commands. + // we just burned 31 (33) cycles above, remove 8 (8*4=32), + // user wants to wait at least 2 us, after multiply us >= 12 + + // 1 cycle when if jump here + us -= 8; // 2 cycles + // 2 cycles to jump back to delay cycle. 
+ } #elif F_CPU >= 20000000L __asm__ __volatile__ ( @@ -290,17 +556,18 @@ void delayMicroseconds(unsigned int us) // +1 cycle (register save) // user wants to wait longer than 3 us - if (us > 15) // = 3 cycles + if (us > 17) // = 3 cycles { // Since the loop is not accurately 1/5 of a microsecond we need // to multiply us by 0.9216 (18.432 / 20) - us = (us * 60398L) >> 16; // x0.9216 us = 29 cycles (60398 = 0.9216 * 0x10000L) + us = (us * 60398UL) >> 16; // x0.9216 us = 29 cycles (60398 = 0.9216 * 0x10000L) + // this drops us to at least 16 // account for the time taken in the preceeding commands. // we just burned 59 (61) cycles above, remove 15, (15*4=60) us -= 15; // = 2 cycles } - else + else { // account for the time taken in the preceeding commands. // we just burned 33 (35) cycles above, remove 9, (9*4=36) @@ -310,6 +577,42 @@ void delayMicroseconds(unsigned int us) // 2 cycles to jump back to delay cycle. } +#elif F_CPU >= 18000000L + // for the 18 MHz clock, if somebody is working with USB + // or otherwise relating to 12 or 24 MHz clocks + + // for a 1 microsecond delay, simply return. the overhead + // of the function call takes 14 (16) cycles, which is .8 us + if (us <= 1) return; // = 3 cycles, (4 when true) + + // make the loop below last 6 cycles +#undef _MORENOP_ +#define _MORENOP_ " nop \n\t nop \n\t" + + // the following loop takes 1/3 of a microsecond (6 cycles) per iteration, + // so execute it three times for each microsecond of delay requested. + us = (us << 1) + us; // x3 us, = 5 cycles + + // account for the time taken in the preceeding commands. + // we just burned 20 (22) cycles above, remove 3 (3*6=18), + // us is at least 6 so we may subtract 3 + us -= 3; // = 2 cycles + +#elif F_CPU >= 16500000L + // for a one-microsecond delay, simply return. 
the overhead + // of the function call takes 14 (16) cycles, which is about 1us + if (us <= 1) return; // = 3 cycles, (4 when true) + + // the following loop takes 1/4 of a microsecond (4 cycles) times 32./33. + // per iteration, thus rescale us by 4. * 33. / 32. = 4.125 to compensate + us = (us << 2) + (us >> 3); // x4.125 with 22 cycles + + // account for the time taken in the preceding commands. + // we burned 37 (39) cycles above, plus 2 below, remove 10 (4*10=40) + // us is at least 8, so we subtract only 7 to keep it positive + // the error is below one microsecond and not worth extra code + us -= 7; // = 2 cycles + #elif F_CPU >= 16000000L // for a one-microsecond delay, simply return. the overhead // of the function call takes 14 (16) cycles, which is 1 us @@ -337,7 +640,8 @@ void delayMicroseconds(unsigned int us) { // Since the loop is not accurately 1/4 of a microsecond we need // to multiply us by 0.9216 (14.7456 / 16) - us = (us * 60398L) >> 16; // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L) + us = (us * 60398UL) >> 16; // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L) + // this drops us to at least 14 // account for the time taken in the preceeding commands. // we just burned 53 (57) cycles above, remove 13, (13*4=52) @@ -377,12 +681,13 @@ void delayMicroseconds(unsigned int us) us = (us << 1) + us; // x3 us, = 5 cycles // +1 cycle (register save) - // user wants to wait longer than 4 us - if (us > 14) // = 3 cycles + // user wants to wait longer than 5 us + if (us > 15) // = 3 cycles { // since the loop is not accurately 1/3 of a microsecond we need // to multiply us by 0.9216 (11.0592 / 12) - us = (us * 60398L) >> 16; // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L) + us = (us * 60398UL) >> 16; // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L) + // this drops us to at least 14 // account for the time taken in the preceeding commands. 
// we just burned 53 (55) cycles above, remove 13, (13*4=52) @@ -391,7 +696,8 @@ void delayMicroseconds(unsigned int us) else { // account for the time taken in the preceeding commands. - // we just burned 27 (29) cycles above, remove 7, (7*4=28) + // we just burned 27 (29) cycles above, remove 7 (7*4=28), + // us is at least 9, so we may subtract without rollunder // 1 cycle when if jump here us -= 7; // 2 cycles @@ -406,41 +712,47 @@ void delayMicroseconds(unsigned int us) if (us <= 2) return; // = 3 cycles, (4 when true) // the following loop takes 2/5 of a microsecond (4 cycles) - // per iteration, so execute it three times for each microsecond of - // delay requested. + // per iteration, so execute it five times for every 2 microseconds + // of delay requested. us = (us << 1) + (us >> 1); // x2.5 us, = 7 cycles // account for the time taken in the preceeding commands. - // we just burned 22 (24) cycles above, remove 5, (5*4=20) - // us is at least 20 so we can substract 5 - us -= 5; // = 2 cycles + // we burn 22 (24) cycles above plus 2 below, remove 6, (6*4=24) + // us is at least 7 so we can subtract 6 + us -= 6; // = 2 cycles #elif F_CPU >= 9216000L // the overhead of the function call is 14 (16) cycles which is ~1.5 us - if (us <= 3) return; // = 3 cycles, (4 when true) + if (us <= 2) return; // = 3 cycles, (4 when true) - us = (us << 2) + us ; // x2.5x2 us, = 7 cycles + // factor of 10 in multiplying by 2 and making the loop last 5 cycles + us <<= 1; // x2 us, = 2 cycles + + // make the delay loop last 5 cycles +#undef _MORENOP_ +#define _MORENOP_ " nop \n\t" // +1 cycle (register save) - // user wants to wait longer than 6 us - if (us > 30) // = 3 cycles + // user wants to wait longer than 5 us + if (us > 11) // = 3 cycles { - // since the loop is not accurately 2/5 of a microsecond we need + // since the loop is not accurately 1/2 of a microsecond we need // to multiply us by 0.9216 (11.0592 / 12) - us = (us * 30199L) >> 16; // x(0.9216/2) us = 29 
cycles (30199 = 0.4608 x 0x10000L) + us = (us * 60398UL) >> 16; // x(0.9216) us = 29 cycles + // this drops us to at least 11 // account for the time taken in the preceeding commands. - // we just burned 53 (55) cycles above, remove 13, (13*4=52) - us -= 13; // = 2 cycles + // we just burned 48 (50) cycles above, remove 10 (10*5=50) + us -= 10; // = 2 cycles } else { // account for the time taken in the preceeding commands. - // we just burned 31 (33) cycles above, remove 8, (8*4=32) + // we just burned 26 (28) cycles above, remove 5 (5*5=25) + // us is at least 6 so we may subtract 5 // 1 cycle when if jump here - us >>= 1; // 2 cycles restore x2.5 us - us -= 8; // 2 cycles + us -= 5; // 2 cycles // 2 cycles to jump back to delay cycle. } @@ -479,7 +791,8 @@ void delayMicroseconds(unsigned int us) { // since the loop is not accurately 1/2 of a microsecond we need // to multiply us by 0.9216 (7.3728 / 8) - us = (us * 60398L) >> 16; // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L) + us = (us * 60398UL) >> 16; // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L) + // this drops us to at least 14 // account for the time taken in the preceeding commands. // we just burned 52 (54) cycles above, remove 13, (13*4=52) @@ -496,6 +809,20 @@ void delayMicroseconds(unsigned int us) // 2 cycles to jump back to delay cycle. } +#elif F_CPU >= 6000000L + // for a 1 to 3 microsecond delay, simply return. 
the overhead + // of the function call takes 14 (16) cycles, which is 2.5us + if (us <= 3) return; // = 3 cycles, (4 when true) + + // make the loop below last 6 cycles +#undef _MORENOP_ +#define _MORENOP_ " nop \n\t nop \n\t" + + // the following loop takes 1 microsecond (6 cycles) per iteration + // we burned 15 (17) cycles above, plus 2 below, remove 3 (3 * 6 = 18) + // us is at least 4 so we can subtract 3 + us -= 3; // = 2 cycles + #elif F_CPU >= 4000000L __asm__ __volatile__ ("nop"); // just waiting 1 cycle // the overhead of the function call is 15 (17) cycles which is 4 us @@ -511,13 +838,16 @@ void delayMicroseconds(unsigned int us) // of the function call takes 14 (16) cycles, which is almost 4 us if (us <= 6) return; // = 3 cycles, (4 when true) + // Question: + // Are we certain that there is a register save? // +1 cycle (register save) - // user wants to wait longer than 12 us - if (us > 12) // = 3 cycles + // user wants to wait longer than 14 us + if (us > 14) // = 3 cycles { // since the loop is not accurately 1 microsecond we need // to multiply us by 0.9216 ( = 3.6864 / 4) - us = (us * 60398L) >> 16; // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L) + us = (us * 60398UL) >> 16; // x0.9216 us = 29 cycles (60398 = 0.9216 x 0x10000L) + // this drops us to at least 13 // account for the time taken in the preceeding commands. // we just burned 47 (49) cycles above, remove 12, (12*4=48) @@ -551,15 +881,16 @@ void delayMicroseconds(unsigned int us) #elif F_CPU >= 1843200L // for less than 13 microsecond delay, simply return. 
the overhead // of the function call takes 14 (16) cycles, which is almost 8 us - if (us <= 12) return; // = 3 cycles, (4 when true) + if (us <= 13) return; // = 3 cycles, (4 when true) // no register save here - // user wants to wait longer than 25 us - if (us > 25) // = 3 cycles + // user wants to wait longer than 54 us + if (us > 54) // = 3 cycles { // since the loop takes ~2.17 microseconds we need // to multiply us by 0.4608 ( = 1.8432 / 2 / 2 ) us = (us * 30199L) >> 16; // x(0.9216/2) us = 29 cycles (30199 = 0.4608 x 0x10000L) + // this drops us to at least 25 // account for the time taken in the preceeding commands. // we just burned 47 (49) cycles above, remove 24, microseconds @@ -572,6 +903,7 @@ void delayMicroseconds(unsigned int us) // 1 cycle when if jump here us -= 12; // 2 cycles + // this drops us to at least 2 and we divide by 2 below us >>= 1; // division by 2 = 2 cycles // 2 cycles to jump back to delay cycle. @@ -591,13 +923,14 @@ void delayMicroseconds(unsigned int us) // us is at least 4, divided by 4 gives us 1 (no zero delay bug) us >>= 2; // us div 4, = 4 cycles - #endif // busy wait __asm__ __volatile__ ( "1: sbiw %0,1" "\n\t" // 2 cycles - _MORENOP_ // 4 cycles if 32 MHz or 1 cycle if 25 MHz + _MORENOP_ // 4 cycles if 32 MHz or + // 1 cycle if 25, 9.216 + // 2 cycles if 18, 6 MHz " brne 1b" // 2 cycles : /* no outputs */ : "w" (us) @@ -698,9 +1031,9 @@ void init() #if defined(TCCR4A) && defined(TCCR4B) && defined(TCCR4D) TCCR4B |= _BV(CS42) | _BV(CS41) | _BV(CS40); // Set timer 4 prescale factor to 64 - TCCR4D |= _BV(WGM40); // Put timer 4 in phase- and frequency-correct PWM mode + TCCR4D |= _BV(WGM40); // Put timer 4 in phase- and frequency-correct PWM mode TCCR4A |= _BV(PWM4A); // Enable PWM mode for comparator OCR4A - TCCR4C |= _BV(PWM4D); // Enable PWM mode for comparator OCR4D + TCCR4C |= _BV(PWM4D); // Enable PWM mode for comparator OCR4D #elif defined(TCCR4B) && defined(CS41) && defined(WGM40) TCCR4B |= _BV(CS41) | _BV(CS40); // 
Set timer 4 prescale factor to 64 TCCR4A |= _BV(WGM40); // Put timer 4 in 8-bit phase correct pwm mode diff --git a/avr/cores/MCUdude_corefiles/wiring_analog.c b/avr/cores/MCUdude_corefiles/wiring_analog.c index 3e609533..64010031 100755 --- a/avr/cores/MCUdude_corefiles/wiring_analog.c +++ b/avr/cores/MCUdude_corefiles/wiring_analog.c @@ -77,7 +77,7 @@ int analogRead(uint8_t pin) ADCSRA |= _BV(ADSC); // ADSC is cleared when the conversion finishes - while (bit_is_set(ADCSRA, ADSC)); + while (ADCSRA & _BV(ADSC)); // we have to read ADCL first; doing so locks both ADCL // and ADCH until ADCH is read. reading ADCL second would