arm correlate q15 8c source


CMSIS DSP Software Library: arm_correlate_q15.c Source File Main Page Modules Data Structures Files Examples File List Globals arm_correlate_q15.c Go to the documentation of this file.00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_correlate_q15.c 00009 * 00010 * Description: Q15 Correlation. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated 00025 * 00026 * Version 0.0.7 2010/06/10 00027 * Misra-C changes done 00028 * 00029 * -------------------------------------------------------------------- */ 00030 00031 #include "arm_math.h" 00032 00065 void arm_correlate_q15( 00066 q15_t * pSrcA, 00067 uint32_t srcALen, 00068 q15_t * pSrcB, 00069 uint32_t srcBLen, 00070 q15_t * pDst) 00071 { 00072 q15_t *pIn1; /* inputA pointer */ 00073 q15_t *pIn2; /* inputB pointer */ 00074 q15_t *pOut = pDst; /* output pointer */ 00075 q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00076 q15_t *px; /* Intermediate inputA pointer */ 00077 q15_t *py; /* Intermediate inputB pointer */ 00078 q15_t *pSrc1; /* Intermediate pointers */ 00079 q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */ 00080 uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */ 00081 int32_t inc = 1; /* Destination address modifier */ 00082 q31_t *pb; /* 32 bit pointer for inputB buffer */ 00083 00084 00085 /* The algorithm implementation is based on the lengths of the inputs. */ 00086 /* srcB is always made to slide across srcA. */ 00087 /* So srcBLen is always considered as shorter or equal to srcALen */ 00088 /* But CORR(x, y) is reverse of CORR(y, x) */ 00089 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */ 00090 /* and the destination pointer modifier, inc is set to -1 */ 00091 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */ 00092 /* But to improve the performance, 00093 * we include zeroes in the output instead of zero padding either of the the inputs*/ 00094 /* If srcALen > srcBLen, 00095 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */ 00096 /* If srcALen < srcBLen, 00097 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */ 00098 if(srcALen >= srcBLen) 00099 { 00100 /* Initialization of inputA pointer */ 00101 pIn1 = (pSrcA); 00102 00103 /* Initialization of inputB pointer */ 00104 pIn2 = (pSrcB); 00105 00106 /* Number of output samples is calculated */ 00107 outBlockSize = (2u * srcALen) - 1u; 00108 00109 /* When srcALen > srcBLen, zero padding is done to srcB 00110 * to make their lengths equal. 00111 * Instead, (outBlockSize - (srcALen + srcBLen - 1)) 00112 * number of output samples are made zero */ 00113 j = outBlockSize - (srcALen + (srcBLen - 1u)); 00114 00115 while(j > 0u) 00116 { 00117 /* Zero is stored in the destination buffer */ 00118 *pOut++ = 0; 00119 00120 /* Decrement the loop counter */ 00121 j--; 00122 } 00123 00124 } 00125 else 00126 { 00127 /* Initialization of inputA pointer */ 00128 pIn1 = (pSrcB); 00129 00130 /* Initialization of inputB pointer */ 00131 pIn2 = (pSrcA); 00132 00133 /* srcBLen is always considered as shorter or equal to srcALen */ 00134 j = srcBLen; 00135 srcBLen = srcALen; 00136 srcALen = j; 00137 00138 /* CORR(x, y) = Reverse order(CORR(y, x)) */ 00139 /* Hence set the destination pointer to point to the last output sample */ 00140 pOut = pDst + ((srcALen + srcBLen) - 2u); 00141 00142 /* Destination address modifier is set to -1 */ 00143 inc = -1; 00144 00145 } 00146 00147 /* The function is internally 00148 * divided into three parts according to the number of multiplications that has to be 00149 * taken place between inputA samples and inputB samples. In the first part of the 00150 * algorithm, the multiplications increase by one for every iteration. 00151 * In the second part of the algorithm, srcBLen number of multiplications are done. 00152 * In the third part of the algorithm, the multiplications decrease by one 00153 * for every iteration.*/ 00154 /* The algorithm is implemented in three stages. 00155 * The loop counters of each stage is initiated here. */ 00156 blockSize1 = srcBLen - 1u; 00157 blockSize2 = srcALen - (srcBLen - 1u); 00158 blockSize3 = blockSize1; 00159 00160 /* -------------------------- 00161 * Initializations of stage1 00162 * -------------------------*/ 00163 00164 /* sum = x[0] * y[srcBlen - 1] 00165 * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1] 00166 * .... 00167 * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1] 00168 */ 00169 00170 /* In this stage the MAC operations are increased by 1 for every iteration. 00171 The count variable holds the number of MAC operations performed */ 00172 count = 1u; 00173 00174 /* Working pointer of inputA */ 00175 px = pIn1; 00176 00177 /* Working pointer of inputB */ 00178 pSrc1 = pIn2 + (srcBLen - 1u); 00179 py = pSrc1; 00180 00181 /* ------------------------ 00182 * Stage1 process 00183 * ----------------------*/ 00184 00185 /* The first loop starts here */ 00186 while(blockSize1 > 0u) 00187 { 00188 /* Accumulator is made zero for every iteration */ 00189 sum = 0; 00190 00191 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00192 k = count >> 2; 00193 00194 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00195 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00196 while(k > 0u) 00197 { 00198 /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */ 00199 sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 00200 /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */ 00201 sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 00202 00203 /* Decrement the loop counter */ 00204 k--; 00205 } 00206 00207 /* If the count is not a multiple of 4, compute any remaining MACs here. 00208 ** No loop unrolling is used. */ 00209 k = count % 0x4u; 00210 00211 while(k > 0u) 00212 { 00213 /* Perform the multiply-accumulates */ 00214 /* x[0] * y[srcBLen - 1] */ 00215 sum = __SMLALD(*px++, *py++, sum); 00216 00217 /* Decrement the loop counter */ 00218 k--; 00219 } 00220 00221 /* Store the result in the accumulator in the destination buffer. */ 00222 *pOut = (q15_t) (__SSAT((sum >> 15), 16)); 00223 /* Destination pointer is updated according to the address modifier, inc */ 00224 pOut += inc; 00225 00226 /* Update the inputA and inputB pointers for next MAC calculation */ 00227 py = pSrc1 - count; 00228 px = pIn1; 00229 00230 /* Increment the MAC count */ 00231 count++; 00232 00233 /* Decrement the loop counter */ 00234 blockSize1--; 00235 } 00236 00237 /* -------------------------- 00238 * Initializations of stage2 00239 * ------------------------*/ 00240 00241 /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1] 00242 * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1] 00243 * .... 00244 * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00245 */ 00246 00247 /* Working pointer of inputA */ 00248 px = pIn1; 00249 00250 /* Working pointer of inputB */ 00251 py = pIn2; 00252 00253 /* Initialize inputB pointer of type q31 */ 00254 pb = (q31_t *) (py); 00255 00256 /* count is index by which the pointer pIn1 to be incremented */ 00257 count = 0u; 00258 00259 /* ------------------- 00260 * Stage2 process 00261 * ------------------*/ 00262 00263 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00264 * So, to loop unroll over blockSize2, 00265 * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */ 00266 if(srcBLen >= 4u) 00267 { 00268 /* Loop unroll over blockSize2, by 4 */ 00269 blkCnt = blockSize2 >> 2u; 00270 00271 while(blkCnt > 0u) 00272 { 00273 /* Set all accumulators to zero */ 00274 acc0 = 0; 00275 acc1 = 0; 00276 acc2 = 0; 00277 acc3 = 0; 00278 00279 /* read x[0], x[1] samples */ 00280 x0 = *(q31_t *) (px++); 00281 /* read x[1], x[2] samples */ 00282 x1 = *(q31_t *) (px++); 00283 00284 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00285 k = srcBLen >> 2u; 00286 00287 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00288 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00289 do 00290 { 00291 /* Read the first two inputB samples using SIMD: 00292 * y[0] and y[1] */ 00293 c0 = *(pb++); 00294 00295 /* acc0 += x[0] * y[0] + x[1] * y[1] */ 00296 acc0 = __SMLALD(x0, c0, acc0); 00297 00298 /* acc1 += x[1] * y[0] + x[2] * y[1] */ 00299 acc1 = __SMLALD(x1, c0, acc1); 00300 00301 /* Read x[2], x[3] */ 00302 x2 = *(q31_t *) (px++); 00303 00304 /* Read x[3], x[4] */ 00305 x3 = *(q31_t *) (px++); 00306 00307 /* acc2 += x[2] * y[0] + x[3] * y[1] */ 00308 acc2 = __SMLALD(x2, c0, acc2); 00309 00310 /* acc3 += x[3] * y[0] + x[4] * y[1] */ 00311 acc3 = __SMLALD(x3, c0, acc3); 00312 00313 /* Read y[2] and y[3] */ 00314 c0 = *(pb++); 00315 00316 /* acc0 += x[2] * y[2] + x[3] * y[3] */ 00317 acc0 = __SMLALD(x2, c0, acc0); 00318 00319 /* acc1 += x[3] * y[2] + x[4] * y[3] */ 00320 acc1 = __SMLALD(x3, c0, acc1); 00321 00322 /* Read x[4], x[5] */ 00323 x0 = *(q31_t *) (px++); 00324 00325 /* Read x[5], x[6] */ 00326 x1 = *(q31_t *) (px++); 00327 00328 /* acc2 += x[4] * y[2] + x[5] * y[3] */ 00329 acc2 = __SMLALD(x0, c0, acc2); 00330 00331 /* acc3 += x[5] * y[2] + x[6] * y[3] */ 00332 acc3 = __SMLALD(x1, c0, acc3); 00333 00334 } while(--k); 00335 00336 /* For the next MAC operations, SIMD is not used 00337 * So, the 16 bit pointer if inputB, py is updated */ 00338 py = (q15_t *) (pb); 00339 00340 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00341 ** No loop unrolling is used. */ 00342 k = srcBLen % 0x4u; 00343 00344 if(k == 1u) 00345 { 00346 /* Read y[4] */ 00347 c0 = *py; 00348 c0 = c0 & 0x0000FFFF; 00349 00350 /* Read x[7] */ 00351 x3 = *(q31_t *) px++; 00352 00353 /* Perform the multiply-accumulates */ 00354 acc0 = __SMLALD(x0, c0, acc0); 00355 acc1 = __SMLALD(x1, c0, acc1); 00356 acc2 = __SMLALDX(x1, c0, acc2); 00357 acc3 = __SMLALDX(x3, c0, acc3); 00358 } 00359 00360 if(k == 2u) 00361 { 00362 /* Read y[4], y[5] */ 00363 c0 = *(pb); 00364 00365 /* Read x[7], x[8] */ 00366 x3 = *(q31_t *) px++; 00367 00368 /* Read x[9] */ 00369 x2 = *(q31_t *) px++; 00370 00371 /* Perform the multiply-accumulates */ 00372 acc0 = __SMLALD(x0, c0, acc0); 00373 acc1 = __SMLALD(x1, c0, acc1); 00374 acc2 = __SMLALD(x3, c0, acc2); 00375 acc3 = __SMLALD(x2, c0, acc3); 00376 } 00377 00378 if(k == 3u) 00379 { 00380 /* Read y[4], y[5] */ 00381 c0 = *pb++; 00382 00383 /* Read x[7], x[8] */ 00384 x3 = *(q31_t *) px++; 00385 00386 /* Read x[9] */ 00387 x2 = *(q31_t *) px++; 00388 00389 /* Perform the multiply-accumulates */ 00390 acc0 = __SMLALD(x0, c0, acc0); 00391 acc1 = __SMLALD(x1, c0, acc1); 00392 acc2 = __SMLALD(x3, c0, acc2); 00393 acc3 = __SMLALD(x2, c0, acc3); 00394 00395 /* Read y[6] */ 00396 c0 = (q15_t) (*pb); 00397 c0 = c0 & 0x0000FFFF; 00398 00399 /* Read x[10] */ 00400 x3 = *(q31_t *) px++; 00401 00402 /* Perform the multiply-accumulates */ 00403 acc0 = __SMLALDX(x1, c0, acc0); 00404 acc1 = __SMLALD(x2, c0, acc1); 00405 acc2 = __SMLALDX(x2, c0, acc2); 00406 acc3 = __SMLALDX(x3, c0, acc3); 00407 } 00408 00409 /* Store the result in the accumulator in the destination buffer. */ 00410 *pOut = (q15_t) (__SSAT(acc0 >> 15, 16)); 00411 /* Destination pointer is updated according to the address modifier, inc */ 00412 pOut += inc; 00413 00414 *pOut = (q15_t) (__SSAT(acc1 >> 15, 16)); 00415 pOut += inc; 00416 00417 *pOut = (q15_t) (__SSAT(acc2 >> 15, 16)); 00418 pOut += inc; 00419 00420 *pOut = (q15_t) (__SSAT(acc3 >> 15, 16)); 00421 pOut += inc; 00422 00423 /* Increment the count by 4 as 4 output values are computed */ 00424 count += 4u; 00425 00426 /* Update the inputA and inputB pointers for next MAC calculation */ 00427 px = pIn1 + count; 00428 py = pIn2; 00429 pb = (q31_t *) (py); 00430 00431 00432 /* Decrement the loop counter */ 00433 blkCnt--; 00434 } 00435 00436 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00437 ** No loop unrolling is used. */ 00438 blkCnt = blockSize2 % 0x4u; 00439 00440 while(blkCnt > 0u) 00441 { 00442 /* Accumulator is made zero for every iteration */ 00443 sum = 0; 00444 00445 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00446 k = srcBLen >> 2u; 00447 00448 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00449 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00450 while(k > 0u) 00451 { 00452 /* Perform the multiply-accumulates */ 00453 sum += ((q63_t) * px++ * *py++); 00454 sum += ((q63_t) * px++ * *py++); 00455 sum += ((q63_t) * px++ * *py++); 00456 sum += ((q63_t) * px++ * *py++); 00457 00458 /* Decrement the loop counter */ 00459 k--; 00460 } 00461 00462 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00463 ** No loop unrolling is used. */ 00464 k = srcBLen % 0x4u; 00465 00466 while(k > 0u) 00467 { 00468 /* Perform the multiply-accumulates */ 00469 sum += ((q63_t) * px++ * *py++); 00470 00471 /* Decrement the loop counter */ 00472 k--; 00473 } 00474 00475 /* Store the result in the accumulator in the destination buffer. */ 00476 *pOut = (q15_t) (__SSAT(sum >> 15, 16)); 00477 /* Destination pointer is updated according to the address modifier, inc */ 00478 pOut += inc; 00479 00480 /* Increment count by 1, as one output value is computed */ 00481 count++; 00482 00483 /* Update the inputA and inputB pointers for next MAC calculation */ 00484 px = pIn1 + count; 00485 py = pIn2; 00486 00487 /* Decrement the loop counter */ 00488 blkCnt--; 00489 } 00490 } 00491 else 00492 { 00493 /* If the srcBLen is not a multiple of 4, 00494 * the blockSize2 loop cannot be unrolled by 4 */ 00495 blkCnt = blockSize2; 00496 00497 while(blkCnt > 0u) 00498 { 00499 /* Accumulator is made zero for every iteration */ 00500 sum = 0; 00501 00502 /* Loop over srcBLen */ 00503 k = srcBLen; 00504 00505 while(k > 0u) 00506 { 00507 /* Perform the multiply-accumulate */ 00508 sum += ((q63_t) * px++ * *py++); 00509 00510 /* Decrement the loop counter */ 00511 k--; 00512 } 00513 00514 /* Store the result in the accumulator in the destination buffer. */ 00515 *pOut = (q15_t) (__SSAT(sum >> 15, 16)); 00516 /* Destination pointer is updated according to the address modifier, inc */ 00517 pOut += inc; 00518 00519 /* Increment the MAC count */ 00520 count++; 00521 00522 /* Update the inputA and inputB pointers for next MAC calculation */ 00523 px = pIn1 + count; 00524 py = pIn2; 00525 00526 /* Decrement the loop counter */ 00527 blkCnt--; 00528 } 00529 } 00530 00531 /* -------------------------- 00532 * Initializations of stage3 00533 * -------------------------*/ 00534 00535 /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00536 * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1] 00537 * .... 00538 * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1] 00539 * sum += x[srcALen-1] * y[0] 00540 */ 00541 00542 /* In this stage the MAC operations are decreased by 1 for every iteration. 00543 The count variable holds the number of MAC operations performed */ 00544 count = srcBLen - 1u; 00545 00546 /* Working pointer of inputA */ 00547 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00548 px = pSrc1; 00549 00550 /* Working pointer of inputB */ 00551 py = pIn2; 00552 00553 /* ------------------- 00554 * Stage3 process 00555 * ------------------*/ 00556 00557 while(blockSize3 > 0u) 00558 { 00559 /* Accumulator is made zero for every iteration */ 00560 sum = 0; 00561 00562 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00563 k = count >> 2u; 00564 00565 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00566 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00567 while(k > 0u) 00568 { 00569 /* Perform the multiply-accumulates */ 00570 /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */ 00571 sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 00572 /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */ 00573 sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum); 00574 00575 /* Decrement the loop counter */ 00576 k--; 00577 } 00578 00579 /* If the count is not a multiple of 4, compute any remaining MACs here. 00580 ** No loop unrolling is used. */ 00581 k = count % 0x4u; 00582 00583 while(k > 0u) 00584 { 00585 /* Perform the multiply-accumulates */ 00586 sum = __SMLALD(*px++, *py++, sum); 00587 00588 /* Decrement the loop counter */ 00589 k--; 00590 } 00591 00592 /* Store the result in the accumulator in the destination buffer. */ 00593 *pOut = (q15_t) (__SSAT((sum >> 15), 16)); 00594 /* Destination pointer is updated according to the address modifier, inc */ 00595 pOut += inc; 00596 00597 /* Update the inputA and inputB pointers for next MAC calculation */ 00598 px = ++pSrc1; 00599 py = pIn2; 00600 00601 /* Decrement the MAC count */ 00602 count--; 00603 00604 /* Decrement the loop counter */ 00605 blockSize3--; 00606 } 00607 00608 } 00609  All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines Generated on Mon Nov 29 2010 17:19:56 for CMSIS DSP Software Library by  1.7.2

Wyszukiwarka

Podobne podstrony:
arm conv q15? source
arm shift q15? source
arm correlate q15?
arm scale q15? source
arm sin q15? source
arm rms q15? source
arm mult q15? source
arm correlate ?2? source
arm sub q15? source
arm correlate q31? source
arm copy q15? source
arm min q15? source
arm std q15? source
arm ?s q15? source
arm correlate q7? source
arm var q15? source
arm negate q15? source
arm ?t4 q15? source
arm lms q15? source

więcej podobnych podstron