// $Id: CryptoPrimitives.h,v 1.1.10.3 2003/08/26 09:08:15 cssharp Exp $

/*                                                                    tab:4
 * "Copyright (c) 2000-2003 The Regents of the University of California.
 * All rights reserved.
 *
 * Permission to use, copy, modify, and distribute this software and its
 * documentation for any purpose, without fee, and without written agreement is
 * hereby granted, provided that the above copyright notice, the following
 * two paragraphs and the author appear in all copies of this software.
 *
 * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT
 * OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF THE UNIVERSITY OF
 * CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS
 * ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATION TO
 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS."
 *
 * Copyright (c) 2002-2003 Intel Corporation
 * All rights reserved.
 *
 * This file is distributed under the terms in the attached INTEL-LICENSE
 * file. If you do not find these files, copies can be found by writing to
 * Intel Research Berkeley, 2150 Shattuck Avenue, Suite 1300, Berkeley, CA,
 * 94704. Attention: Intel License Inquiry.
 */

/*
 * Authors: Naveen Sastry
 * Date:    10/24/02
 */

// Look at the movw instruction to shave a few more cycles.
// [probably only for the atmel 128's]

/*
 * Performs a leftward rotation on 32 bits of data, 1 bit
 * at a time.
 * (2 + (n * 9)) cycles
 */
/**
 * @author Naveen Sastry
 */
#define rol32(a, n) ({                            \
  unsigned long num = (unsigned long)(a);         \
  unsigned char nsh = (unsigned char)(n);         \
  __asm__ __volatile__ (                          \
    "dec %0"       "\n\t"                         \
    "brmi L_%="    "\n\t"                         \
    "L1_%=:"       "\n\t"                         \
    "clc"          "\n\t"                         \
    "sbrc %D1, 7"  "\n\t"                         \
    "sec"          "\n\t"                         \
    "rol %A1"      "\n\t"                         \
    "rol %B1"      "\n\t"                         \
    "rol %C1"      "\n\t"                         \
    "rol %D1"      "\n\t"                         \
    "dec %0"       "\n\t"                         \
    "brpl L1_%="   "\n\t"                         \
    "L_%=:"        "\n\t"                         \
    : "=r" (nsh), "=r" (num)                      \
    : "0" (nsh), "1" (num)                        \
  );                                              \
  a = num;                                        \
})
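/*
 * Usage sketch (illustrative, not part of the original file): rol32 rotates
 * a 32-bit lvalue in place and, like everything in this header, assembles
 * only for AVR targets (e.g. with avr-gcc). The guard macro and function
 * name below are hypothetical and exist purely for this example.
 */
#ifdef CRYPTO_PRIMITIVES_EXAMPLES
static inline unsigned long example_rotl32(unsigned long w, unsigned char n)
{
  rol32(w, n);   /* rotate w left by n bits, one bit per loop iteration  */
  return w;      /* e.g. example_rotl32(0x80000001UL, 1) == 0x00000003UL */
}
#endif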
/*
 * Performs a rightward rotation on 32 bits of data, 1 bit
 * at a time.
 * (2 + (n * 9)) cycles
 */
#define ror32(a, n) ({                            \
  unsigned long num = (unsigned long)(a);         \
  unsigned char nsh = (unsigned char)(n);         \
  __asm__ (                                       \
    "dec %0"       "\n\t"                         \
    "brmi L_%="    "\n\t"                         \
    "L1_%=:"       "\n\t"                         \
    "clc"          "\n\t"                         \
    "sbrc %A1, 0"  "\n\t"                         \
    "sec"          "\n\t"                         \
    "ror %D1"      "\n\t"                         \
    "ror %C1"      "\n\t"                         \
    "ror %B1"      "\n\t"                         \
    "ror %A1"      "\n\t"                         \
    "dec %0"       "\n\t"                         \
    "brpl L1_%="   "\n\t"                         \
    "L_%=:"        "\n\t"                         \
    : "=r" (nsh), "=r" (num)                      \
    : "0" (nsh), "1" (num)                        \
  );                                              \
  a = num;                                        \
})

/*
 * Copies a 4 byte char buf to a long; does not advance the char ptr.
 * 10 cycles
 */
#define c2l(c,l) ({                               \
  __asm__ ( "mov r30, %A1"  "\n\t"                \
            "mov r31, %B1"  "\n\t"                \
            "ld %A0, Z+"    "\n\t"                \
            "ld %B0, Z+"    "\n\t"                \
            "ld %C0, Z+"    "\n\t"                \
            "ld %D0, Z "    "\n\t"                \
            : "=r" (l)                            \
            : "r" (c)                             \
            : "r30", "r31");                      \
});

/*
 * Copies a long to a 4 byte char buf; does not advance the char ptr.
 * 10 cycles
 */
#define l2c(l,c) ({                               \
  __asm__ volatile ( "mov r30, %A0"  "\n\t"       \
            "mov r31, %B0"  "\n\t"                \
            "st Z+, %A1"    "\n\t"                \
            "st Z+, %B1"    "\n\t"                \
            "st Z+, %C1"    "\n\t"                \
            "st Z, %D1"     "\n\t"                \
            :                                     \
            : "r" (c), "r" (l)                    \
            : "r30", "r31");                      \
});

/*
 * Performs a 1 byte block roll to the left; equivalent to
 * rol32(a, 8)
 * 5 cycles
 */
#define brol1(a) ({                               \
  uint8_t brol1tmp;                               \
  __asm__ ( "mov %1, %D0"   "\n\t"                \
            "mov %D0, %C0"  "\n\t"                \
            "mov %C0, %B0"  "\n\t"                \
            "mov %B0, %A0"  "\n\t"                \
            "mov %A0, %1"   "\n\t"                \
            : "=r" (a), "=r" (brol1tmp)           \
            : "0" (a)                             \
  );                                              \
});

/*
 * Performs a 2 byte block roll to the left; equivalent to
 * rol32(a, 16)
 * 6 cycles
 */
#define brol2(a) ({                               \
  uint8_t brol2tmp;                               \
  __asm__ ( "mov %1, %A0"   "\n\t"                \
            "mov %A0, %C0"  "\n\t"                \
            "mov %C0, %1"   "\n\t"                \
            "mov %1, %B0"   "\n\t"                \
            "mov %B0, %D0"  "\n\t"                \
            "mov %D0, %1"   "\n\t"                \
            : "=r" (a), "=r" (brol2tmp)           \
            : "0" (a)                             \
  );                                              \
});

/*
 * Performs a 3 byte block roll to the left; equivalent to
 * rol32(a, 24)
 * 5 cycles
 */
#define brol3(a) ({                               \
  uint8_t brol3tmp;                               \
  __asm__ ( "mov %1, %A0"   "\n\t"                \
            "mov %A0, %B0"  "\n\t"                \
            "mov %B0, %C0"  "\n\t"                \
            "mov %C0, %D0"  "\n\t"                \
            "mov %D0, %1"   "\n\t"                \
            : "=r" (a), "=r" (brol3tmp)           \
            : "0" (a)                             \
  );                                              \
});

#define bror1(a) (brol3(a))
#define bror2(a) (brol2(a))
#define bror3(a) (brol1(a))

/*
 * Fast rotate to the left using the above primitives.
 * (switch): 16 cycles
 * (brol.) : 5 cycles
 * sub     : 1 cycle
 * rol32   : 2 + (9n), 0 <= n <= 4
 * ===============================
 * BEST    : 16 / 21 cycles (byte boundaries)
 * AVG     : 42 cycles
 * WORST   : 60 cycles
 */
#define fastrol32(a, n) ({                                                  \
  switch ((n)) {                                                            \
  case 0: break;                                                            \
  case 1: case 2: case 3: case 4: case 5: rol32(a, (n)); break;             \
  case 6: case 7: brol1(a); ror32(a, 8-(n)); break;                         \
  case 8: case 9: case 10: case 11: case 12: brol1(a); rol32(a, (n)-8);     \
    break;                                                                  \
  case 13: case 14: case 15: case 16: brol2(a); ror32(a, 16-(n)); break;    \
  case 17: case 18: case 19: case 20: brol2(a); rol32(a, (n)-16); break;    \
  case 21: case 22: case 23: case 24: brol3(a); ror32(a, 24-(n)); break;    \
  case 25: case 26: case 27: case 28: brol3(a); rol32(a, (n)-24); break;    \
  case 29: case 30: case 31: ror32(a, 32-(n));                              \
  }                                                                         \
});

// can be improved to eliminate the subtraction
#define fastror32(a,n) fastrol32(a, (32-(n)))

// convert a 2 byte char array to an unsigned short:
// [assumes MOST significant byte is first]
#define c2sM(c, s) (s  = ((unsigned short)(*((c)))) << 8L,  \
                    s |= ((unsigned short)(*((c+1)))))

// convert an unsigned short to a 2 byte char array:
// [assumes MOST significant byte is first]
#define s2cM(s, c) (*((c))   = (unsigned short)(((s) >> 8L) & 0xff),  \
                    *((c+1)) = (unsigned short)(((s)      ) & 0xff))
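/*
 * Usage sketch (illustrative, not part of the original file): fastrol32 /
 * fastror32 rotate a 32-bit lvalue in place by combining a byte-block roll
 * with a short bit-by-bit loop, and c2sM / s2cM move a most-significant-
 * byte-first pair of bytes in and out of an unsigned short. The guard macro
 * and all names below are hypothetical and exist purely for this example;
 * it assembles only for AVR targets (e.g. with avr-gcc).
 */
#ifdef CRYPTO_PRIMITIVES_EXAMPLES
static inline void example_primitives(unsigned long *word, unsigned char *buf)
{
  unsigned short half;

  fastrol32(*word, 13);   /* rotate *word left by 13 bits                */
  fastror32(*word, 13);   /* rotate back; *word is unchanged overall     */

  c2sM(buf, half);        /* half = (buf[0] << 8) | buf[1]               */
  s2cM(half, buf);        /* write half back out, most significant first */
}
#endif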