-
2004-09-14
就这样了,俺疲惫不堪
运动补偿那一部分代码,也不知道运算量会有多少。
好像没起到应有的效果。
put_pixels 和 avg_pixels 的代码要从参考帧读取一个宏块的内容,地址有可能是不连续的,所以原来的代码是一个字节一个字节的取,然后组织成为一个word,但是可以通过判断地址是0/1/2/3在什么字节上对齐,然后做相应的改进,改是改对了,有些微的提高。
-
2004-09-08
嗯,小有成效啊
dct32函数直接编译出来有1000多条指令,4k的cache就没掉了。
之所以出现这种情况是因为中间访存太多。如果把同时需要用到的同一地址调度到一起,则能减少内存访问的指令。事实是调度后的指令数减少到700多条。
/*
 * Butterfly helpers used by dct32().
 *
 * Every macro works in place on an array named `tab` that must be in scope
 * where the macro is expanded.  BF additionally assigns to two scratch
 * variables named `tmp0` and `tmp1`, which the caller has to declare.
 * MULL and COS4_0 are expected to be defined elsewhere.
 */

/*
 * Single butterfly on tab[a] / tab[b]:
 *   tab[a] receives the sum of the two old values,
 *   tab[b] receives MULL(old tab[a] - old tab[b], c).
 */
#define BF(a, b, c)\
{\
    tmp0 = tab[a] + tab[b];\
    tmp1 = tab[a] - tab[b];\
    tab[a] = tmp0;\
    tab[b] = MULL(tmp1, c);\
}

/*
 * Two butterflies, (a, b) with COS4_0 and (c, d) with -COS4_0, followed by
 * folding tab[d] into tab[c].
 */
#define BF1(a, b, c, d)\
{\
    BF(a, b, COS4_0);\
    BF(c, d, -COS4_0);\
    tab[c] += tab[d];\
}

/*
 * Same two butterflies as BF1, followed by a chain of four additions.
 * The additions depend on each other, so their order must be kept.
 */
#define BF2(a, b, c, d)\
{\
    BF(a, b, COS4_0);\
    BF(c, d, -COS4_0);\
    tab[c] += tab[d];\
    tab[a] += tab[c];\
    tab[c] += tab[b];\
    tab[b] += tab[d];\
}

/* Before scheduling: */
/*
 * DCT32 without 1/sqrt(2) coef zero scaling.
 *
 * Unscheduled version: each butterfly pass is run over the whole 32-entry
 * array before the next pass starts.
 *
 * out: receives 32 results; the assignments at the end of the function
 *      define which tab[] entries land in which out[] slot.
 * tab: 32 input values.  The BF/BF1/BF2 macros update this array in place,
 *      so its contents are overwritten.
 *
 * tmp0 and tmp1 are the scratch variables that the BF macro assigns to.
 * NOTE(review): ADD and the COSn_m constants are not defined in this
 * excerpt; ADD(a, b) presumably performs tab[a] += tab[b] -- confirm.
 */
static void dct32(int32_t *out, int32_t *tab)
{
int tmp0, tmp1;/* pass 1: butterfly entry i with entry 31 - i */
BF(0, 31, COS0_0);
BF(1, 30, COS0_1);
BF(2, 29, COS0_2);
BF(3, 28, COS0_3);
BF(4, 27, COS0_4);
BF(5, 26, COS0_5);
BF(6, 25, COS0_6);
BF(7, 24, COS0_7);
BF(8, 23, COS0_8);
BF(9, 22, COS0_9);
BF(10, 21, COS0_10);
BF(11, 20, COS0_11);
BF(12, 19, COS0_12);
BF(13, 18, COS0_13);
BF(14, 17, COS0_14);
BF(15, 16, COS0_15);/* pass 2: butterflies inside each half (0..15, 16..31) */
BF(0, 15, COS1_0);
BF(1, 14, COS1_1);
BF(2, 13, COS1_2);
BF(3, 12, COS1_3);
BF(4, 11, COS1_4);
BF(5, 10, COS1_5);
BF(6, 9, COS1_6);
BF(7, 8, COS1_7);
BF(16, 31, -COS1_0);
BF(17, 30, -COS1_1);
BF(18, 29, -COS1_2);
BF(19, 28, -COS1_3);
BF(20, 27, -COS1_4);
BF(21, 26, -COS1_5);
BF(22, 25, -COS1_6);
BF(23, 24, -COS1_7);
/* pass 3: butterflies inside each group of eight entries */
BF(0, 7, COS2_0);
BF(1, 6, COS2_1);
BF(2, 5, COS2_2);
BF(3, 4, COS2_3);
BF(8, 15, -COS2_0);
BF(9, 14, -COS2_1);
BF(10, 13, -COS2_2);
BF(11, 12, -COS2_3);
BF(16, 23, COS2_0);
BF(17, 22, COS2_1);
BF(18, 21, COS2_2);
BF(19, 20, COS2_3);
BF(24, 31, -COS2_0);
BF(25, 30, -COS2_1);
BF(26, 29, -COS2_2);
BF(27, 28, -COS2_3);/* pass 4: butterflies inside each group of four entries */
BF(0, 3, COS3_0);
BF(1, 2, COS3_1);
BF(4, 7, -COS3_0);
BF(5, 6, -COS3_1);
BF(8, 11, COS3_0);
BF(9, 10, COS3_1);
BF(12, 15, -COS3_0);
BF(13, 14, -COS3_1);
BF(16, 19, COS3_0);
BF(17, 18, COS3_1);
BF(20, 23, -COS3_0);
BF(21, 22, -COS3_1);
BF(24, 27, COS3_0);
BF(25, 26, COS3_1);
BF(28, 31, -COS3_0);
BF(29, 30, -COS3_1);
/* pass 5: finish each group of four, alternating BF1 and BF2 */
BF1(0, 1, 2, 3);
BF2(4, 5, 6, 7);
BF1(8, 9, 10, 11);
BF2(12, 13, 14, 15);
BF1(16, 17, 18, 19);
BF2(20, 21, 22, 23);
BF1(24, 25, 26, 27);
BF2(28, 29, 30, 31);
/* pass 6: ADD chain over entries 8..15; each step uses the previous result */
ADD( 8, 12);
ADD(12, 10);
ADD(10, 14);
ADD(14, 9);
ADD( 9, 13);
ADD(13, 11);
ADD(11, 15);out[ 0] = tab[0];
/* the even-numbered outputs are copied straight from tab[0..15] */
out[16] = tab[1];
out[ 8] = tab[2];
out[24] = tab[3];
out[ 4] = tab[4];
out[20] = tab[5];
out[12] = tab[6];
out[28] = tab[7];
out[ 2] = tab[8];
out[18] = tab[9];
out[10] = tab[10];
out[26] = tab[11];
out[ 6] = tab[12];
out[22] = tab[13];
out[14] = tab[14];
out[30] = tab[15];
/* same ADD chain over entries 24..31 */
ADD(24, 28);
ADD(28, 26);
ADD(26, 30);
ADD(30, 25);
ADD(25, 29);
ADD(29, 27);
ADD(27, 31);out[ 1] = tab[16] + tab[24];
/* the odd-numbered outputs are sums of two entries from tab[16..31] */
out[17] = tab[17] + tab[25];
out[ 9] = tab[18] + tab[26];
out[25] = tab[19] + tab[27];
out[ 5] = tab[20] + tab[28];
out[21] = tab[21] + tab[29];
out[13] = tab[22] + tab[30];
out[29] = tab[23] + tab[31];
out[ 3] = tab[24] + tab[20];
out[19] = tab[25] + tab[21];
out[11] = tab[26] + tab[22];
out[27] = tab[27] + tab[23];
out[ 7] = tab[28] + tab[18];
out[23] = tab[29] + tab[19];
out[15] = tab[30] + tab[17];
/* the last output is a plain copy, not a sum */
out[31] = tab[31];
}调度后:
/*
 * DCT32 without 1/sqrt(2) coef zero scaling.
 *
 * Scheduled version: it contains the same butterflies as the pass-by-pass
 * version, reordered so that operations touching the same tab[] entries sit
 * next to each other.  According to the accompanying notes this is meant to
 * cut the number of memory-access instructions the compiler emits.
 *
 * out: receives 32 results; the assignments interleaved with the
 *      computation define which tab[] entries land in which out[] slot.
 * tab: 32 input values.  The BF/BF1/BF2 macros update this array in place,
 *      so its contents are overwritten.
 *
 * tmp0 and tmp1 are the scratch variables that the BF macro assigns to.
 * NOTE(review): ADD and the COSn_m constants are not defined in this
 * excerpt; ADD(a, b) presumably performs tab[a] += tab[b] -- confirm.
 */
static void dct32(int32_t *out, int32_t *tab)
{
int tmp0, tmp1;/* passes 1 and 2 are interleaved: */
/* for each i in 0..7 the pass-1 butterflies (i, 31-i) and (15-i, 16+i) are
   followed at once by the pass-2 butterflies (i, 15-i) and (16+i, 31-i) */
BF(0, 31, COS0_0);
BF(15, 16, COS0_15);
BF(0, 15, COS1_0);
BF(16, 31, -COS1_0);BF(1, 30, COS0_1);
BF(14, 17, COS0_14);
BF(1, 14, COS1_1);
BF(17, 30, -COS1_1);BF(2, 29, COS0_2);
BF(13, 18, COS0_13);
BF(2, 13, COS1_2);
BF(18, 29, -COS1_2);
BF(3, 28, COS0_3);
BF(12, 19, COS0_12);
BF(3, 12, COS1_3);
BF(19, 28, -COS1_3);
BF(4, 27, COS0_4);
BF(11, 20, COS0_11);
BF(4, 11, COS1_4);
BF(20, 27, -COS1_4);BF(5, 26, COS0_5);
BF(10, 21, COS0_10);
BF(5, 10, COS1_5);
BF(21, 26, -COS1_5);BF(6, 25, COS0_6);
BF(9, 22, COS0_9);
BF(6, 9, COS1_6);
BF(22, 25, -COS1_6);BF(7, 24, COS0_7);
BF(8, 23, COS0_8);
BF(7, 8, COS1_7);
BF(23, 24, -COS1_7);
/* passes 3, 4 and 5 are run back to back on one group of eight entries at a
   time, instead of pass by pass over the whole array */
/* entries 0..7 ... */
/* ... whose results are stored to out[] as soon as they are final */
BF(0, 7, COS2_0);
BF(1, 6, COS2_1);
BF(2, 5, COS2_2);
BF(3, 4, COS2_3);
BF(0, 3, COS3_0);
BF(1, 2, COS3_1);
BF(4, 7, -COS3_0);
BF(5, 6, -COS3_1);
BF1(0, 1, 2, 3);
BF2(4, 5, 6, 7);out[ 0] = tab[0];
out[16] = tab[1];
out[ 8] = tab[2];
out[24] = tab[3];
out[ 4] = tab[4];
out[20] = tab[5];
out[12] = tab[6];
out[28] = tab[7];
/* entries 8..15: passes 3-5, then their ADD chain, then the stores */
BF(8, 15, -COS2_0);
BF(9, 14, -COS2_1);
BF(10, 13, -COS2_2);
BF(11, 12, -COS2_3);
BF(8, 11, COS3_0);
BF(9, 10, COS3_1);
BF(12, 15, -COS3_0);
BF(13, 14, -COS3_1);
BF1(8, 9, 10, 11);
BF2(12, 13, 14, 15);ADD( 8, 12);
ADD(12, 10);
ADD(10, 14);
ADD(14, 9);
ADD( 9, 13);
ADD(13, 11);
ADD(11, 15);out[ 2] = tab[8];
out[18] = tab[9];
out[10] = tab[10];
out[26] = tab[11];
out[ 6] = tab[12];
out[22] = tab[13];
out[14] = tab[14];
out[30] = tab[15];
/* entries 16..23: passes 3-5 */
BF(16, 23, COS2_0);
BF(17, 22, COS2_1);
BF(18, 21, COS2_2);
BF(19, 20, COS2_3);
BF(16, 19, COS3_0);
BF(17, 18, COS3_1);
BF(20, 23, -COS3_0);
BF(21, 22, -COS3_1);
BF1(16, 17, 18, 19);
BF2(20, 21, 22, 23);
/* entries 24..31: passes 3-5 */
BF(24, 31, -COS2_0);
BF(25, 30, -COS2_1);
BF(26, 29, -COS2_2);
BF(27, 28, -COS2_3);
BF(24, 27, COS3_0);
BF(25, 26, COS3_1);
BF(28, 31, -COS3_0);
BF(29, 30, -COS3_1);
BF1(24, 25, 26, 27);
BF2(28, 29, 30, 31);
/* pass 6 for entries 24..31; each ADD step uses the previous result */
ADD(24, 28);
ADD(28, 26);
ADD(26, 30);
ADD(30, 25);
ADD(25, 29);
ADD(29, 27);
ADD(27, 31);out[ 1] = tab[16] + tab[24];
/* the odd-numbered outputs are sums of two entries from tab[16..31] */
out[17] = tab[17] + tab[25];
out[ 9] = tab[18] + tab[26];
out[25] = tab[19] + tab[27];
out[ 5] = tab[20] + tab[28];
out[21] = tab[21] + tab[29];
out[13] = tab[22] + tab[30];
out[29] = tab[23] + tab[31];
out[ 3] = tab[24] + tab[20];
out[19] = tab[25] + tab[21];
out[11] = tab[26] + tab[22];
out[27] = tab[27] + tab[23];
out[ 7] = tab[28] + tab[18];
out[23] = tab[29] + tab[19];
out[15] = tab[30] + tab[17];
/* the last output is a plain copy, not a sum */
out[31] = tab[31];
}此外,像这种代码
BF(16, 31, -COS1_0);
后面的一个负系数,可以变化成正的,只要将宏中的 (tab[a]-tab[b])×(-C) 变成 (tab[b]-tab[a])×C 即可,所以正负系数是可以共享的
-
2004-09-07
怎么办啊,这个代码
/*
 * Multiply-accumulate helpers for the synthesis window.
 *
 * Both macros read eight elements of `w` (or `w1`/`w2`) and eight elements
 * of `p`, each at indices 0, 64, 128, ..., 448, and combine the products
 * into an accumulator with the operator passed as `op` (for example += or
 * -=).  MULS is expected to be defined elsewhere.
 */

/* sum  op=  MULS(w[k * 64], p[k * 64])   for k = 0..7 */
#define SUM8(sum, op, w, p) \
{ \
    sum op MULS((w)[0 * 64], (p)[0 * 64]);\
    sum op MULS((w)[1 * 64], (p)[1 * 64]);\
    sum op MULS((w)[2 * 64], (p)[2 * 64]);\
    sum op MULS((w)[3 * 64], (p)[3 * 64]);\
    sum op MULS((w)[4 * 64], (p)[4 * 64]);\
    sum op MULS((w)[5 * 64], (p)[5 * 64]);\
    sum op MULS((w)[6 * 64], (p)[6 * 64]);\
    sum op MULS((w)[7 * 64], (p)[7 * 64]);\
}

/*
 * Two accumulations sharing the same `p` operand: each p[k * 64] is loaded
 * once into a block-local `tmp` and used for both sum1 (with w1) and sum2
 * (with w2).  Because the expansion declares its own `tmp`, do not pass an
 * expression that refers to a caller variable of that name.
 */
#define SUM8P2(sum1, op1, sum2, op2, w1, w2, p) \
{ \
    int tmp;\
    tmp = (p)[0 * 64];\
    sum1 op1 MULS((w1)[0 * 64], tmp);\
    sum2 op2 MULS((w2)[0 * 64], tmp);\
    tmp = (p)[1 * 64];\
    sum1 op1 MULS((w1)[1 * 64], tmp);\
    sum2 op2 MULS((w2)[1 * 64], tmp);\
    tmp = (p)[2 * 64];\
    sum1 op1 MULS((w1)[2 * 64], tmp);\
    sum2 op2 MULS((w2)[2 * 64], tmp);\
    tmp = (p)[3 * 64];\
    sum1 op1 MULS((w1)[3 * 64], tmp);\
    sum2 op2 MULS((w2)[3 * 64], tmp);\
    tmp = (p)[4 * 64];\
    sum1 op1 MULS((w1)[4 * 64], tmp);\
    sum2 op2 MULS((w2)[4 * 64], tmp);\
    tmp = (p)[5 * 64];\
    sum1 op1 MULS((w1)[5 * 64], tmp);\
    sum2 op2 MULS((w2)[5 * 64], tmp);\
    tmp = (p)[6 * 64];\
    sum1 op1 MULS((w1)[6 * 64], tmp);\
    sum2 op2 MULS((w2)[6 * 64], tmp);\
    tmp = (p)[7 * 64];\
    sum1 op1 MULS((w1)[7 * 64], tmp);\
    sum2 op2 MULS((w2)[7 * 64], tmp);\
}

/* This piece of code was changed a bit: */
/*
 * 32-band sub-band synthesis filter: takes 32 sub-band samples and produces
 * 32 output samples.
 *
 * s1         decoder context; the per-channel buffer synth_buf[ch] and its
 *            position synth_buf_offset[ch] are read and updated.
 * ch         channel index into those per-channel arrays.
 * samples    destination; output sample k is stored at samples[k * incr].
 * incr       distance, in int16_t elements, between consecutive outputs.
 * sb_samples the input sub-band samples; they are handed to dct32(), which
 *            uses the array as working storage and overwrites it.
 *
 * XXX: optimize by avoiding ring buffer usage
 */
static void synth_filter(MPADecodeContext *s1,
                         int ch, int16_t *samples, int incr,
                         int32_t sb_samples[SBLIMIT])
{
    int32_t dct_out[32];
    register MPA_INT *ring;
    register const MPA_INT *src;
    int k, pos, val;
#if FRAC_BITS <= 15
    int acc, acc_mirror;
#else
    int64_t acc, acc_mirror;
#endif

    dct32(dct_out, sb_samples);

    /* store the 32 transformed values at the current buffer position */
    pos = s1->synth_buf_offset[ch];
    ring = s1->synth_buf[ch] + pos;
    for (k = 0; k < 32; k++) {
        val = dct_out[k];
#if FRAC_BITS <= 15
        /* NOTE: clamping to the 16-bit range can lose precision on very
           high amplitude sound */
        if (val > 32767) {
            val = 32767;
        } else if (val < -32768) {
            val = -32768;
        }
#endif
        ring[k] = val;
    }

    /* mirror the new entries 512 slots further on so that the reads below
       never have to wrap around */
    memcpy(ring + 512, ring, 32 * sizeof(MPA_INT));

    /* output sample 0 */
    acc = 0;
    src = ring + 16;
    SUM8(acc, +=, window, src);
    src = ring + 48;
    SUM8(acc, -=, window + 32, src);
    samples[0] = round_sample(acc);

    /* output samples k and 32 - k are computed together so that every
       buffer value is loaded once for the pair */
    for (k = 1; k < 16; k++) {
        acc = 0;
        acc_mirror = 0;
        src = ring + 16 + k;
        SUM8P2(acc, +=, acc_mirror, -=, window + k, window + 32 - k, src);
        src = ring + 48 - k;
        SUM8P2(acc, -=, acc_mirror, -=, window + 32 + k, window + 64 - k, src);
        samples[k * incr] = round_sample(acc);
        samples[(32 - k) * incr] = round_sample(acc_mirror);
    }

    /* output sample 16 has no mirror partner */
    src = ring + 32;
    acc = 0;
    SUM8(acc, -=, window + 48, src);
    samples[16 * incr] = round_sample(acc);

    /* step the buffer position back by 32 entries, modulo 512 */
    s1->synth_buf_offset[ch] = (pos - 32) & 511;
}

/*
 * The summation code fetches its operands at a stride of 64 elements;
 * changing it to read consecutive elements can lower the data-cache miss
 * rate.
 */
-
2004-09-06
gcc编译步骤控制
gcc -E 预处理,生成中间文件.i,宏被替换
gcc -S 编译,不汇编,生成汇编文件.s
gcc -c 编译并汇编,但不链接,生成目标文件.o
gcc 不加开关,默认生成可执行文件a.out
gcc -g 增加符号表,用于调试使用
gcc -p 加入profiling代码,用于监测文件各函数调用情况







