使用Openmp在C中并行仿真AES [英] Parallel simulation of AES in C using Openmp

查看:171
本文介绍了使用Openmp在C中并行仿真AES的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

这是我的问题。我想使用Openmp在C中并行化AES-128加密。我几乎没有使用openmp加载以下代码。我的机器是Quadcore intel i5机器。



这是代码。任何建议如何进一步并行化这个代码将真的非常感激。请看代码末尾的主要功能。下面的AES代码由几个功能来实现其功能。请建议如何最好地从中提取并行性。



非常感谢。

  / * 
******************************************* *************************
**高级加密标准实现在C. **
**由Niyaz PK * *
**电子邮件:niyazpk@gmail.com **
**从网站下载:www.hoozi.com **
*********** ************************************************** *****
这是使用最新AES算法进行加密的源代码。
********************************************** ********************
* /

//包含标准输入/输出的stdio.h。
//用于输出到屏幕。
#include< omp.h>
#include< stdio.h>
#include< time.h>
#include< stdlib.h>


//包含AES状态的列数。这是AES中的常数。 Value = 4
#define Nb 4

// AES密码中的回合数。它被简单地启动到零。在程序中收到实际值。
int Nr = 0;

//键中的32位字数。它被简单地启动到零。在程序中收到实际值。
int Nk = 0;

// in - 它是保存要加密的纯文本的数组。
// out - 它是加密后保存输出CipherText的数组。
// state - 在加密期间保存中间结果的数组。
unsigned char in [16],out [16],state [4] [4];

//存储循环密钥的数组。
unsigned char RoundKey [240];

// AES程序的密钥输入
unsigned char Key [32];



int getSBoxValue(int num)
{
int sbox [256] = {
// 0 1 2 3 4 5 6 7 8 9 ABCDEF
0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76,// 0
0xca, 0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0,// 1
0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7 ,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15,// 2
0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2, 0xeb,0x27,0xb2,0x75,// 3
0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84,// 4
0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf,// 5
0xd0,0xef,0xaa,0xfb ,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8,// 6
0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38, 0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2,// 7
0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64 ,0x5d,0x19,0x73,// 8
0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb,// 9
0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79,//A
0xe7,0xc8,0x37,0x6d, 0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08,//B
0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd ,0x74,0x1f,0x4b,0xbd,0x8b,0x8a,// C
0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d, 0x9e,// D
0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55​​,0x28,0xdf,//E
0x8c, 0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16}; // F
return sbox [num];
}

//圆的常数字数组Rcon [i]包含由
// x给出的值,功率(i-1)为在字段GF(28)中,x(x表示为{02})
//请注意,我从1开始,不是0)。
int Rcon [255] = {
0x8d,0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36,0x6c,0xd8,0xab,0x4d,0x9a,
0x2f,0x5e,0xbc,0x63,0xc6,0x97,0x35,0x6a,0xd4,0xb3,0x7d,0xfa,0xef,0xc5,0x91,0x39,
0x72,0xe4,0xd3,0xbd,0x61,0xc2,0x9f ,0x25,0x4a,0x94,0x33,0x66,0xcc,0x83,0x1d,0x3a,
0x74,0xe8,0xcb,0x8d,0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36 ,0x6c,0xd8,
0xab,0x4d,0x9a,0x2f,0x5e,0xbc,0x63,0xc6,0x97,0x35,0x6a,0xd4,0xb3,0x7d,0xfa,0xef,
0xc5,0x91,0x39 ,0x72,0xe4,0xd3,0xbd,0x61,0xc2,0x9f,0x25,0x4a,0x94,0x33,0x66,0xcc,
0x83,0x1d,0x3a,0x74,0xe8,0xcb,0x8d,0x01,0x02,0x04 ,0x08,0x10,0x20,0x40,0x80,0x1b,
0x36,0x6c,0xd8,0xab,0x4d,0x9a,0x2f,0x5e,0xbc,0x63,0xc6,0x97,0x35,0x6a,0xd4,0xb3,
0x7d,0xfa,0xef,0xc5,0x91,0x39,0x72,0xe4,0xd3,0xbd,0x61,0xc2,0x9f,0x25,0x4a,0x94,
0x33,0x66,0x cc,0x83,0x1d,0x3a,0x74,0xe8,0xcb,0x8d,0x01,0x02,0x04,0x08,0x10,0x20,
0x40,0x80,0x1b,0x36,0x6c,0xd8,0xab,0x4d,0x9a, 0x2f,0x5e,0xbc,0x63,0xc6,0x97,0x35,
0x6a,0xd4,0xb3,0x7d,0xfa,0xef,0xc5,0x91,0x39,0x72,0xe4,0xd3,0xbd,0x61,0xc2,0x9f,
0x25,0x4a,0x94,0x33,0x66,0xcc,0x83,0x1d,0x3a,0x74,0xe8,0xcb,0x8d,0x01,0x02,0x04,
0x08,0x10,0x20,0x40,0x80, 0x1b,0x36,0x6c,0xd8,0xab,0x4d,0x9a,0x2f,0x5e,0xbc,0x63,
0xc6,0x97,0x35,0x6a,0xd4,0xb3,0x7d,0xfa,0xef,0xc5,0x91,0x39, 0x72,0xe4,0xd3,0xbd,
0x61,0xc2,0x9f,0x25,0x4a,0x94,0x33,0x66,0xcc,0x83,0x1d,0x3a,0x74,0xe8,0xcb};

//此函数生成Nb(Nr + 1)圆键。循环密钥用于每一轮加密状态。
void KeyExpansion()
{
int i,j;
unsigned char temp [4],k;

//第一个圆键是键本身。 (i = 0; i< Nk; i ++)

{
RoundKey [i * 4] = Key [i * 4];
RoundKey [i * 4 + 1] =键[i * 4 + 1];
RoundKey [i * 4 + 2] =键[i * 4 + 2];
RoundKey [i * 4 + 3] =键[i * 4 + 3];
}

//所有其​​他循环密钥都是从前一个循环密钥中找到的。
while(i<(Nb *(Nr + 1)))
{
for(j = 0; j< 4; j ++)
{
temp [j] = RoundKey [(i-1)* 4 + j];
}
如果(i%Nk == 0)
{
//此函数将一个字中的4个字节向左旋转一次。
// [a0,a1,a2,a3]成为[a1,a2,a3,a0]

//功能RotWord()
{
k = temp [0];
temp [0] = temp [1];
temp [1] = temp [2];
temp [2] = temp [3];
temp [3] = k;
}

// SubWord()是一个接收四字节输入字的函数,
//将S-box应用于四个字节中的每一个,以产生输出字。

//功能子字()
{
temp [0] = getSBoxValue(temp [0]);
temp [1] = getSBoxValue(temp [1]);
temp [2] = getSBoxValue(temp [2]);
temp [3] = getSBoxValue(temp [3]);
}

temp [0] = temp [0] ^ Rcon [i / Nk];
}
else if(Nk> 6&& i%Nk == 4)
{
//功能子字()
{
temp [0] = getSBoxValue(temp [0]);
temp [1] = getSBoxValue(temp [1]);
temp [2] = getSBoxValue(temp [2]);
temp [3] = getSBoxValue(temp [3]);
}
}
RoundKey [i * 4 + 0] = RoundKey [(i-Nk)* 4 + 0] ^ temp [0];
RoundKey [i * 4 + 1] = RoundKey [(i-Nk)* 4 + 1] ^ temp [1];
RoundKey [i * 4 + 2] = RoundKey [(i-Nk)* 4 + 2] ^ temp [2];
RoundKey [i * 4 + 3] = RoundKey [(i-Nk)* 4 + 3] ^ temp [3];
i ++;
}
}

//此函数将循环密钥添加到状态。
//循环密钥由XOR函数添加到状态。
void AddRoundKey(int round)
{
int i,j; (i = 0; i <4; i ++)

(j = 0; j <4; j ++)
{
state [j] [i] ^ = RoundKey [round * Nb * 4 + i * Nb + j];
}
}
}

// SubBytes函数用
//状态矩阵中的值替换S框中的值。
void SubBytes()
{
int i,j; (i = 0; i <4; i ++)
{
for(j = 0; j< 4; j ++)
{
state [i] [j] = getSBoxValue(state [i] [j]);

}
}
}

// ShiftRows()函数将状态向左移动。
//每行都以不同的偏移位移。
// Offset =行号。所以第一行没有移动。
void ShiftRows()
{
unsigned char temp;

//将第一行1列向左旋转
temp = state [1] [0];
state [1] [0] = state [1] [1];
state [1] [1] = state [1] [2];
state [1] [2] = state [1] [3];
state [1] [3] = temp;

//将第二行2列旋转到左边
temp = state [2] [0];
state [2] [0] = state [2] [2];
state [2] [2] = temp;

temp = state [2] [1];
state [2] [1] = state [2] [3];
state [2] [3] = temp;

//将第三行3列旋转到左边
temp = state [3] [0];
state [3] [0] = state [3] [3];
state [3] [3] = state [3] [2];
state [3] [2] = state [3] [1];
state [3] [1] = temp;
}

// xtime是一个宏,它找到{02}的乘积和xtime模数{1b}的参数
#define xtime(x)((x < < 1)^((x>> 7)& 1)* 0x1b))

// MixColumns函数混合状态矩阵的列
//使用的方法可能看起来很复杂,但如果你知道基础理论,这很容易。
//参考上面指定的文件。
void MixColumns()
{
int i;
unsigned char Tmp,Tm,t; (i = 0; i <4; i ++)

{
t = state [0] [i];
Tmp = state [0] [i] ^ state [1] [i] ^ state [2] [i] ^ state [3] [i]
Tm = state [0] [i] ^ state [1] [i]; Tm = xtime(Tm);状态[0] [i] ^ = Tm ^ Tmp;
Tm = state [1] [i] ^ state [2] [i]; Tm = xtime(Tm);状态[1] [i] ^ = Tm ^ Tmp;
Tm = state [2] [i] ^ state [3] [i]; Tm = xtime(Tm);状态[2] [i] ^ = Tm ^ Tmp;
Tm = state [3] [i] ^ t; Tm = xtime(Tm);状态[3] [i] ^ = Tm ^ Tmp;
}
}

// Cipher是加密PlainText的主要功能。
void Cipher()
{
int i,j,round = 0;

//将输入PlainText复制到状态数组。 (i = 0; i <4; i ++)

(j = 0; j <4; j ++)
{
state [j] [i] = in [i * 4 + j];
}
}

//在开始回合之前将第一个回合键添加到状态。
AddRoundKey(0);

//将会有N轮。
//第一个Nr-1轮是相同的。
//这些Nr-1轮在下面的循环中执行。
for(round = 1; round< Nr; round ++)
{
SubBytes();
ShiftRows();
MixColumns();
AddRoundKey(round);
}

//最后一轮如下。
// MixColumns函数不在最后一轮。
SubBytes();
ShiftRows();
AddRoundKey(Nr);

//加密过程结束。
//将状态数组复制到输出数组。 (i = 0; i< 4; i ++)
{
for(j = 0; j< 4; j ++)
{
out [i * 4 + j]的状态= [j]的[I];
}
}
}

void encrypt(int * K,int * PT,int * CT)
{
int i;

// int ct;

//从收到的值计算Nk和Nr。
Nr = 128;
Nk = Nr / 32;
Nr = Nk + 6;


//复制Key和PlainText
(i = 0; i< Nk * 4; i ++)
{
Key [i] = K [I];
在[i] = PT [i];
}

/ *
printf(\\\
Key for encryption:\\\
); (i = 0; i< Nk * 4; i ++)
printf(%02x,Key [i]);

printf(\\\
);
* /
/ *
printf(\\\
Text before encryption:\\\
); (i = 0; i< Nk * 4; i ++)
printf(%02x,in [i]);

printf(\\\
);
* /
//必须在加密之前调用KeyExpansion例程。
KeyExpansion();

//下一个函数调用使用AES算法用Key加密PlainText。
Cipher();


//输出加密文本。
// io_printf(\\\
Text after encrypted:\\\
); (i = 0; i
{
CT [i] = out [i];
printf(%02x,out [i]);
}
printf(\\\
);

// ct = out [15];
// return ct;

}

// main函数
int main()
{


srand(time空值));
unsigned int rnd [4];

int key [16];
int pt [16];
int ct [16];

unsigned int i,j;

#pragma omp parallel for num_threads(4)schedule(dynamic)
for(i = 0; i <65000 * 10; i ++)
{
rnd [ 0] =兰特();
rnd [1] = rand();
rnd [2] = rand();
rnd [3] = rand();

for(j = 0; j <4; j ++)
{
key [4 * j] =(rnd [j]& 0xff)
pt [4 * j] = key [4 * j];
key [4 * j + 1] =((rnd [j]>>>>> 8)& 0xff);
pt [4 * j + 1] = key [4 * j + 1];
key [4 * j + 2] =((rnd [j]>>>> 16)& 0xff);
pt [4 * j + 2] = key [4 * j + 2];
key [4 * j + 3] =((rnd [j]>>> 24)& 0xff);
pt [4 * j + 3] = key [4 * j + 3];
}

#pragma omp task
encrypt(key,pt,ct);

}

返回0;

}

我修改了Hristo建议的代码。谢谢你的努力。以下是代码的外观。我不明白如何使encrypt()函数使用局部变量。你可以解释吗。请添加代码应该在哪里。再次感谢您的努力。
其次,如果没有printf语句,你会看到输出是否正确。我的意思是有其他机制来显示或保存输出。最后,如下所示的代码仍然比串行执行(即,没有openmp)慢。串行版本中没有printf可以使比较公平。

  void encrypt(int * K,int * PT, int * CT)
{
int i;

// int ct;

//从收到的值计算Nk和Nr。
Nr = 128;
Nk = Nr / 32;
Nr = Nk + 6;


//复制Key和PlainText
(i = 0; i< Nk * 4; i ++)
{
Key [i] = K [I];
在[i] = PT [i];
}

/ *
printf(\\\
Key for encryption:\\\
); (i = 0; i< Nk * 4; i ++)
printf(%02x,Key [i]);

printf(\\\
);
* /
/ *
printf(\\\
Text before encryption:\\\
); (i = 0; i< Nk * 4; i ++)
printf(%02x,in [i]);

printf(\\\
);
* /
//必须在加密之前调用KeyExpansion例程。
KeyExpansion();

//下一个函数调用使用AES算法用Key加密PlainText。
Cipher();


//输出加密文本。
// io_printf(\\\
Text after encrypted:\\\
); (i = 0; i
{
CT [i] = out [i];
// printf(%02x,out [i]);
}
// printf(\\\
);

// ct = out [15];
// return ct;

}

// main函数
int main()
{


srand(time空值));
unsigned int rnd [4];

// printf(rand_key =%2x%2x%2x%2x\\\
,rnd [0],rnd [1],rnd [2],rnd [3]);

int key [16];
int pt [16];
int ct [16];

unsigned int i,j;
#pragma omp parallel for private(key,pt,ct)num_threads(2)schedule(static)
for(i = 0; i <65000; i ++)
{
RND [0] =兰特();
rnd [1] = rand();
rnd [2] = rand();
rnd [3] = rand();

for(j = 0; j <4; j ++)
{
key [4 * j] =(rnd [j]& 0xff)
pt [4 * j] = key [4 * j];
key [4 * j + 1] =((rnd [j]>>>>> 8)& 0xff);
pt [4 * j + 1] = key [4 * j + 1];
key [4 * j + 2] =((rnd [j]>>>> 16)& 0xff);
pt [4 * j + 2] = key [4 * j + 2];
key [4 * j + 3] =((rnd [j]>>> 24)& 0xff);
pt [4 * j + 3] = key [4 * j + 3];
}

encrypt(key,pt,ct);


}

返回0;

}


解决方案

schedule(dynamic) task 就我所知AES的本质而言,这是一个完全常规的问题 - 每个加密都需要完全相同的周期数,因此相同的挂钟时间,无论什么关键。这完全排除了使用动态调度和任务的必要性。即使在出现严重问题的情况下,只需添加 schedule(dynamic)是一个非常糟糕的主意。原因是动态的默认块大小是 1 ,这意味着每个线程执行单个迭代然后询问OpenMP运行时的另一个。在你的情况下,开销乘以650000次。动态调度在实际应用时非常强大,但是应该仔细选择最佳块大小,后者通常涉及大量试验,直到找到最优值。



此外生成650000个任务。每个任务具有与其工作线程的创建和随后消耗相关联的一定开销。鉴于AES在Pentium Pro上每字节大约需要18个周期(参考:维基百科)每个调用 encrypt()可能会花费与OpenMP运行时需要的时间相同,以便执行该任务,如果它不是 printf()语句里面。 printf()输出到终端或文件流(如果重定向)并且使用相同的描述符执行I / O本质上是一个串行操作 ,即它连接线程。请参阅此答案了解多少 printf()影响并行性能。



但是,代码中最糟糕的问题实际上是众多的数据竞争。 encrypt()取决于并更改几个全局变量的值。这不仅导致由于真正的缓存共享而减慢,而且很可能也导致完全错误的密文。这些全局变量应该全部在 encrypt()的本地生成,或者如果需要保留全局,则应为 threadprivate 。然后并行循环使用几个共享变量,即 key pt ct 。这些应该是 private



总结:make encrypt()只使用局部变量;使 pt ct 私有;将循环调度更改为 static ;删除任务构造;删除在每次迭代时输出信息的所有 printf 语句。



奖金: rand() 将其状态保存在全局变量中。






有这么多全局变量。只是让他们线程私有。在最后一个全局变量的定义之后添加以下OpenMP pragma:

  ... 
// AES程序的密钥输入
unsigned char Key [32];

#pragma omp threadprivate(Nr,Nk,in,out,state,RoundKey,Key)

...
/ pre>

还可以更改您的 main()函数,如下所示:

  unsigned int i; 
#pragma omp parallel for num_threads(2)schedule(static)
for(i = 0; i< 65000; i ++)
{
unsigned int rnd [4];
int key [16];
int pt [16];
int ct [16];
unsigned int j;
//每线程PRNG初始化
//可以做得更好 - 这仅用于说明目的
unsigned int rand_state = time(NULL)+ 1337 * omp_get_thread_num();

rnd [0] = rand_r(& rand_state);
rnd [1] = rand_r(& rand_state);
rnd [2] = rand_r(& rand_state);
rnd [3] = rand_r(& rand_state);

for(j = 0; j <4; j ++)
{
key [4 * j] =(rnd [j]& 0xff)
pt [4 * j] = key [4 * j];
key [4 * j + 1] =((rnd [j]>>>>> 8)& 0xff);
pt [4 * j + 1] = key [4 * j + 1];
key [4 * j + 2] =((rnd [j]>>>> 16)& 0xff);
pt [4 * j + 2] = key [4 * j + 2];
key [4 * j + 3] =((rnd [j]>>> 24)& 0xff);
pt [4 * j + 3] = key [4 * j + 3];
}

encrypt(key,pt,ct);
}

注意 - 变量,如 pt j 等定义在使用范围内。这样就可以将这些变量全部放在 private 子句中,因为这些变量被预先设定为 private 。现在每个线程都有自己的PRNG状态。


Here is my problem. I want to parallelize AES-128 encryption in C using Openmp. I am hardly getting any speedup with the following code using openmp. My machine is Quadcore intel i5 machine.

Here is the code. Any suggestion how to parallelize this code further would be really really appreciated. Please take a look at the main function which is at the end of the code. AES code below consists of a few functions to achieve its functionality. Please suggest how to best extract parallelism from this.

Thanks so much.

/*
******************************************************************
**       Advanced Encryption Standard implementation in C.      **
**       By Niyaz PK                                            **
**       E-mail: niyazpk@gmail.com                              **
**       Downloaded from Website: www.hoozi.com                 **
******************************************************************
This is the source code for encryption using the latest AES algorithm.
******************************************************************
*/

// Include stdio.h for standard input/output.
// Used for giving output to the screen.
#include<omp.h>
#include<stdio.h>
#include<time.h>
#include<stdlib.h>


// The number of columns comprising a state in AES. This is a constant in AES. Value=4
#define Nb 4

// The number of rounds in AES Cipher. It is simply initiated to zero. The actual value is recieved in the program.
int Nr=0;

// The number of 32 bit words in the key. It is simply initiated to zero. The actual value is recieved in the program.
int Nk=0;

// in - it is the array that holds the plain text to be encrypted.
// out - it is the array that holds the output CipherText after encryption.
// state - the array that holds the intermediate results during encryption.
unsigned char in[16], out[16], state[4][4];

// The array that stores the round keys.
unsigned char RoundKey[240];

// The Key input to the AES Program
unsigned char Key[32];



int getSBoxValue(int num)
{
    int sbox[256] =   {
    //0     1    2      3     4    5     6     7      8    9     A      B    C     D     E     F
    0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, //0
    0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, //1
    0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, //2
    0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, //3
    0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, //4
    0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, //5
    0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, //6
    0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, //7
    0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, //8
    0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, //9
    0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, //A
    0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, //B
    0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, //C
    0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, //D
    0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, //E
    0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }; //F
    return sbox[num];
}

// The round constant word array, Rcon[i], contains the values given by 
// x to th e power (i-1) being powers of x (x is denoted as {02}) in the field GF(28)
// Note that i starts at 1, not 0).
int Rcon[255] = {
    0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 
    0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 
    0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 
    0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 
    0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 
    0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 
    0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 
    0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 
    0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 
    0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 
    0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 
    0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 
    0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 
    0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 
    0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 
    0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb  };

// This function produces Nb(Nr+1) round keys. The round keys are used in each round to encrypt the states. 
void KeyExpansion()
{
    int i,j;
    unsigned char temp[4],k;

    // The first round key is the key itself.
    for(i=0;i<Nk;i++)
    {
        RoundKey[i*4]=Key[i*4];
        RoundKey[i*4+1]=Key[i*4+1];
        RoundKey[i*4+2]=Key[i*4+2];
        RoundKey[i*4+3]=Key[i*4+3];
    }

    // All other round keys are found from the previous round keys.
    while (i < (Nb * (Nr+1)))
    {
        for(j=0;j<4;j++)
        {
            temp[j]=RoundKey[(i-1) * 4 + j];
        }
        if (i % Nk == 0)
        {
            // This function rotates the 4 bytes in a word to the left once.
            // [a0,a1,a2,a3] becomes [a1,a2,a3,a0]

            // Function RotWord()
            {
                k = temp[0];
                temp[0] = temp[1];
                temp[1] = temp[2];
                temp[2] = temp[3];
                temp[3] = k;
            }

            // SubWord() is a function that takes a four-byte input word and 
            // applies the S-box to each of the four bytes to produce an output word.

            // Function Subword()
            {
                temp[0]=getSBoxValue(temp[0]);
                temp[1]=getSBoxValue(temp[1]);
                temp[2]=getSBoxValue(temp[2]);
                temp[3]=getSBoxValue(temp[3]);
            }

            temp[0] =  temp[0] ^ Rcon[i/Nk];
        }
        else if (Nk > 6 && i % Nk == 4)
        {
            // Function Subword()
            {
                temp[0]=getSBoxValue(temp[0]);
                temp[1]=getSBoxValue(temp[1]);
                temp[2]=getSBoxValue(temp[2]);
                temp[3]=getSBoxValue(temp[3]);
            }
        }
        RoundKey[i*4+0] = RoundKey[(i-Nk)*4+0] ^ temp[0];
        RoundKey[i*4+1] = RoundKey[(i-Nk)*4+1] ^ temp[1];
        RoundKey[i*4+2] = RoundKey[(i-Nk)*4+2] ^ temp[2];
        RoundKey[i*4+3] = RoundKey[(i-Nk)*4+3] ^ temp[3];
        i++;
    }
}

// This function adds the round key to state.
// The round key is added to the state by an XOR function.
void AddRoundKey(int round) 
{
    int i,j;
    for(i=0;i<4;i++)
    {
        for(j=0;j<4;j++)
        {
            state[j][i] ^= RoundKey[round * Nb * 4 + i * Nb + j];
        }
    }
}

// The SubBytes Function Substitutes the values in the
// state matrix with values in an S-box.
void SubBytes()
{
    int i,j;
    for(i=0;i<4;i++)
    {
        for(j=0;j<4;j++)
        {
            state[i][j] = getSBoxValue(state[i][j]);

        }
    }
}

// The ShiftRows() function shifts the rows in the state to the left.
// Each row is shifted with different offset.
// Offset = Row number. So the first row is not shifted.
void ShiftRows()
{
    unsigned char temp;

    // Rotate first row 1 columns to left    
    temp=state[1][0];
    state[1][0]=state[1][1];
    state[1][1]=state[1][2];
    state[1][2]=state[1][3];
    state[1][3]=temp;

    // Rotate second row 2 columns to left    
    temp=state[2][0];
    state[2][0]=state[2][2];
    state[2][2]=temp;

    temp=state[2][1];
    state[2][1]=state[2][3];
    state[2][3]=temp;

    // Rotate third row 3 columns to left
    temp=state[3][0];
    state[3][0]=state[3][3];
    state[3][3]=state[3][2];
    state[3][2]=state[3][1];
    state[3][1]=temp;
}

// xtime is a macro that finds the product of {02} and the argument to xtime modulo {1b}  
#define xtime(x)   ((x<<1) ^ (((x>>7) & 1) * 0x1b))

// MixColumns function mixes the columns of the state matrix
// The method used may look complicated, but it is easy if you know the underlying theory.
// Refer the documents specified above.
void MixColumns()
{
    int i;
    unsigned char Tmp,Tm,t;
    for(i=0;i<4;i++)
    {    
        t=state[0][i];
        Tmp = state[0][i] ^ state[1][i] ^ state[2][i] ^ state[3][i] ;
        Tm = state[0][i] ^ state[1][i] ; Tm = xtime(Tm); state[0][i] ^= Tm ^ Tmp ;
        Tm = state[1][i] ^ state[2][i] ; Tm = xtime(Tm); state[1][i] ^= Tm ^ Tmp ;
        Tm = state[2][i] ^ state[3][i] ; Tm = xtime(Tm); state[2][i] ^= Tm ^ Tmp ;
        Tm = state[3][i] ^ t ; Tm = xtime(Tm); state[3][i] ^= Tm ^ Tmp ;
    }
}

// Cipher is the main function that encrypts the PlainText.
void Cipher()
{
    int i,j,round=0;

    //Copy the input PlainText to state array.
    for(i=0;i<4;i++)
    {
        for(j=0;j<4;j++)
        {
            state[j][i] = in[i*4 + j];
        }
    }

    // Add the First round key to the state before starting the rounds.
    AddRoundKey(0); 

    // There will be Nr rounds.
    // The first Nr-1 rounds are identical.
    // These Nr-1 rounds are executed in the loop below.
    for(round=1;round<Nr;round++)
    {
        SubBytes();
        ShiftRows();
        MixColumns();
        AddRoundKey(round);
    }

    // The last round is given below.
    // The MixColumns function is not here in the last round.
    SubBytes();
    ShiftRows();
    AddRoundKey(Nr);

    // The encryption process is over.
    // Copy the state array to output array.
    for(i=0;i<4;i++)
    {
        for(j=0;j<4;j++)
        {
            out[i*4+j]=state[j][i];
        }
    }
}

void encrypt(int *K,int *PT,int *CT)
{
    int i;

    //    int ct;

    // Calculate Nk and Nr from the received value.
    Nr = 128;
    Nk = Nr / 32;
    Nr = Nk + 6;


     // Copy the Key and PlainText
    for(i=0;i<Nk*4;i++)
    {
        Key[i]=K[i];
        in[i]=PT[i];
    }

   /* 
   printf("\nKey for encryption:\n");
    for(i=0; i < Nk*4; i++)
      printf("%02x",Key[i]);
    printf("\n");
*/
/*
    printf("\nText before encryption:\n");
    for(i=0; i < Nk*4; i++)
      printf("%02x",in[i]);
    printf("\n");
*/    
    // The KeyExpansion routine must be called before encryption.
    KeyExpansion();

    // The next function call encrypts the PlainText with the Key using AES algorithm.
    Cipher();


    // Output the encrypted text.
    //io_printf("\nText after encryption:\n");
     for(i=0; i < Nk*4; i++)
    {
        CT[i] = out[i];
        printf("%02x",out[i]);
      }
    printf("\n");

    //  ct = out[15];
    // return ct;

}

//main function
int main()
{


  srand(time(NULL));
  unsigned int rnd[4];

  int key[16];
  int pt[16];
  int ct[16];

  unsigned int i,j;

  #pragma omp parallel for num_threads(4) schedule(dynamic)
  for(i=0; i<65000*10; i++)
  {
   rnd[0]=rand();
   rnd[1]=rand();
   rnd[2]=rand();
   rnd[3]=rand();

   for(j=0; j < 4; j++)
   {
    key[4*j]   = (rnd[j] & 0xff);
    pt[4*j]    = key[4*j];
    key[4*j+1] = ((rnd[j] >> 8)  & 0xff) ; 
    pt[4*j+1]  = key[4*j+1];
    key[4*j+2] = ((rnd[j] >> 16) & 0xff) ;
    pt[4*j+2]  = key[4*j+2];
    key[4*j+3] = ((rnd[j] >> 24) & 0xff) ;
    pt[4*j+3]  = key[4*j+3];
   }

   #pragma omp task      
   encrypt(key,pt,ct);

  }

  return 0;

}

I have modified the code as suggested by Hristo. Thanks for your effort. Here is how the code looks. I don't understand how to make encrypt( ) function use local variables. Can you explain. Please add code where it should be. Thanks again for your efforts. Secondly, if there is no printf statements, how would you see if the output is correct or not. I mean are there other mechanisms to display or save the output. Lastly, the code as shown below is still slower than the serial execution (i.e., without openmp). There is no printf in the serial version either to make the comparison fair.

void encrypt(int *K,int *PT,int *CT)
{
    int i;

    //    int ct;

    // Calculate Nk and Nr from the received value.
    Nr = 128;
    Nk = Nr / 32;
    Nr = Nk + 6;


     // Copy the Key and PlainText
    for(i=0;i<Nk*4;i++)
    {
        Key[i]=K[i];
        in[i]=PT[i];
    }

   /* 
   printf("\nKey for encryption:\n");
    for(i=0; i < Nk*4; i++)
      printf("%02x",Key[i]);
    printf("\n");
*/
/*
    printf("\nText before encryption:\n");
    for(i=0; i < Nk*4; i++)
      printf("%02x",in[i]);
    printf("\n");
*/    
    // The KeyExpansion routine must be called before encryption.
    KeyExpansion();

    // The next function call encrypts the PlainText with the Key using AES algorithm.
    Cipher();


    // Output the encrypted text.
    //io_printf("\nText after encryption:\n");
     for(i=0; i < Nk*4; i++)
    {
        CT[i] = out[i];
//        printf("%02x",out[i]);
      }
//    printf("\n");

    //  ct = out[15];
    // return ct;

}

//main function
int main()
{


  srand(time(NULL));
  unsigned int rnd[4];

//  printf("rand_key = %2x%2x%2x%2x\n",rnd[0],rnd[1],rnd[2],rnd[3]);

  int key[16];
  int pt[16];
  int ct[16];

  unsigned int i,j;
  #pragma omp parallel for private(key,pt,ct) num_threads(2) schedule(static)
  for(i=0; i<65000; i++)
  {
   rnd[0]=rand();
   rnd[1]=rand();
   rnd[2]=rand();
   rnd[3]=rand();

   for(j=0; j < 4; j++)
   {
    key[4*j]   = (rnd[j] & 0xff);
    pt[4*j]    = key[4*j];
    key[4*j+1] = ((rnd[j] >> 8)  & 0xff) ; 
    pt[4*j+1]  = key[4*j+1];
    key[4*j+2] = ((rnd[j] >> 16) & 0xff) ;
    pt[4*j+2]  = key[4*j+2];
    key[4*j+3] = ((rnd[j] >> 24) & 0xff) ;
    pt[4*j+3]  = key[4*j+3];
   }

   encrypt(key,pt,ct);


  }

  return 0;

}

解决方案

You need neither schedule(dynamic) nor task constructs. As far as I am aware of the intrinsics of AES, this is a completely regular problem - each encryption takes exactly the same number of cycles and therefore the same wall-clock time, no matter what the key. This completly rules out the necessity to use dynamic scheduling and tasks. Even in the case of umbalanced problems, simply adding schedule(dynamic) is a very bad idea. The reason for that is that the default chunk size for dynamic is 1, which means that each thread executes a single iteration and then asks the OpenMP runtime for another one. In your case the overhead is multiplied 650000 times. Dynamic scheduling, when actually applicable, is very powerful but one should carefully choose the optimal chunk size with the latter often involving lots of trials until the optimal value is found.

Besides that you generate 650000 tasks. Each task has a certain overhead associated with its creation and subsequent consumption by the worker threads. Given that AES takes about 18 cycles per byte on Pentium Pro (ref: Wikipedia), each call to encrypt() probably would have taken about the same amount of time as the OpenMP runtime needs in order to execute the task if it wasn't for the printf() statement inside. printf() outputs to the terminal or to a file stream (if redirected) and doing I/O with the same descriptor is esentially a serial operation, i.e. it serialises the threads. See this answer to get an idea of how much printf() impacts parallel performance.

But the worst problem of your code is actually the multitude of data races. encrypt() depends on and changes the values of several global variables. This not only leads to slow down due to true cache sharing, but most likely also results in completely wrong ciphertexts. These global variables should be all made local to encrypt() or made threadprivate if it is necessary that they stay global. Then the parallel loop uses several shared variables, namely key, pt and ct. These should be made private.

Summary: make encrypt() use only local variables; make key, pt and ct private; change the loop schedule to static; remove the task construct; remove all printf statements that output information at each iteration.

Bonus: rand() keeps its state in global variables too.


There are so many global variables. Just make them thread-private. Add the following OpenMP pragma just after the definition of the last global variable:

...
// The Key input to the AES Program
unsigned char Key[32];

#pragma omp threadprivate(Nr,Nk,in,out,state,RoundKey,Key)

...

Also change your main() function as follows:

unsigned int i;
#pragma omp parallel for num_threads(2) schedule(static)
for(i = 0; i < 65000; i++)
{
  unsigned int rnd[4];
  int key[16];
  int pt[16];
  int ct[16];
  unsigned int j;
  // Per-thread PRNG initialisation
  // It could be done better - this is for illustration purposes only
  unsigned int rand_state = time(NULL) + 1337*omp_get_thread_num();

  rnd[0] = rand_r(&rand_state);
  rnd[1] = rand_r(&rand_state);
  rnd[2] = rand_r(&rand_state);
  rnd[3] = rand_r(&rand_state);

  for(j = 0; j < 4; j++)
  {
    key[4*j]   = (rnd[j] & 0xff);
    pt[4*j]    = key[4*j];
    key[4*j+1] = ((rnd[j] >> 8)  & 0xff) ; 
    pt[4*j+1]  = key[4*j+1];
    key[4*j+2] = ((rnd[j] >> 16) & 0xff) ;
    pt[4*j+2]  = key[4*j+2];
    key[4*j+3] = ((rnd[j] >> 24) & 0xff) ;
    pt[4*j+3]  = key[4*j+3];
  }

  encrypt(key, pt, ct);
}

Notice - variables such as key, pt, j, etc. are defined in the scope where they are used. This frees you from the necessity to put them all in a private clause since such variables are predetermined to be private. Also each thread now has its own PRNG state.

这篇关于使用Openmp在C中并行仿真AES的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持IT屋!

查看全文
登录 关闭
扫码关注1秒登录
发送“验证码”获取 | 15天全站免登陆