arm conv partial fast q31 8c source


CMSIS DSP Software Library: arm_conv_partial_fast_q31.c Source File Main Page Modules Data Structures Files Examples File List Globals arm_conv_partial_fast_q31.c Go to the documentation of this file.00001 /* ---------------------------------------------------------------------- 00002 * Copyright (C) 2010 ARM Limited. All rights reserved. 00003 * 00004 * $Date: 29. November 2010 00005 * $Revision: V1.0.3 00006 * 00007 * Project: CMSIS DSP Library 00008 * Title: arm_conv_partial_fast_q31.c 00009 * 00010 * Description: Fast Q31 Partial convolution. 00011 * 00012 * Target Processor: Cortex-M4/Cortex-M3 00013 * 00014 * Version 1.0.3 2010/11/29 00015 * Re-organized the CMSIS folders and updated documentation. 00016 * 00017 * Version 1.0.2 2010/11/11 00018 * Documentation updated. 00019 * 00020 * Version 1.0.1 2010/10/05 00021 * Production release and review comments incorporated. 00022 * 00023 * Version 1.0.0 2010/09/20 00024 * Production release and review comments incorporated. 00025 * -------------------------------------------------------------------- */ 00026 00027 #include "arm_math.h" 00028 00053 arm_status arm_conv_partial_fast_q31( 00054 q31_t * pSrcA, 00055 uint32_t srcALen, 00056 q31_t * pSrcB, 00057 uint32_t srcBLen, 00058 q31_t * pDst, 00059 uint32_t firstIndex, 00060 uint32_t numPoints) 00061 { 00062 q31_t *pIn1; /* inputA pointer */ 00063 q31_t *pIn2; /* inputB pointer */ 00064 q31_t *pOut = pDst; /* output pointer */ 00065 q31_t *px; /* Intermediate inputA pointer */ 00066 q31_t *py; /* Intermediate inputB pointer */ 00067 q31_t *pSrc1, *pSrc2; /* Intermediate pointers */ 00068 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */ 00069 q31_t x0, x1, x2, x3, c0; 00070 uint32_t j, k, count, check, blkCnt; 00071 int32_t blockSize1, blockSize2, blockSize3; /* loop counters */ 00072 arm_status status; /* status of Partial convolution */ 00073 00074 00075 /* Check for range of output samples to be calculated */ 00076 if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u)))) 00077 { 00078 /* Set status as ARM_MATH_ARGUMENT_ERROR */ 00079 status = ARM_MATH_ARGUMENT_ERROR; 00080 } 00081 else 00082 { 00083 00084 /* The algorithm implementation is based on the lengths of the inputs. */ 00085 /* srcB is always made to slide across srcA. */ 00086 /* So srcBLen is always considered as shorter or equal to srcALen */ 00087 if(srcALen >= srcBLen) 00088 { 00089 /* Initialization of inputA pointer */ 00090 pIn1 = pSrcA; 00091 00092 /* Initialization of inputB pointer */ 00093 pIn2 = pSrcB; 00094 } 00095 else 00096 { 00097 /* Initialization of inputA pointer */ 00098 pIn1 = pSrcB; 00099 00100 /* Initialization of inputB pointer */ 00101 pIn2 = pSrcA; 00102 00103 /* srcBLen is always considered as shorter or equal to srcALen */ 00104 j = srcBLen; 00105 srcBLen = srcALen; 00106 srcALen = j; 00107 } 00108 00109 /* Conditions to check which loopCounter holds 00110 * the first and last indices of the output samples to be calculated. */ 00111 check = firstIndex + numPoints; 00112 blockSize3 = ((int32_t) check - (int32_t) srcALen); 00113 blockSize3 = (blockSize3 > 0) ? blockSize3 : 0; 00114 blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex); 00115 blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 : 00116 (int32_t) numPoints) : 0; 00117 blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + 00118 (int32_t) firstIndex); 00119 blockSize2 = (blockSize2 > 0) ? blockSize2 : 0; 00120 00121 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */ 00122 /* The function is internally 00123 * divided into three stages according to the number of multiplications that has to be 00124 * taken place between inputA samples and inputB samples. In the first stage of the 00125 * algorithm, the multiplications increase by one for every iteration. 00126 * In the second stage of the algorithm, srcBLen number of multiplications are done. 00127 * In the third stage of the algorithm, the multiplications decrease by one 00128 * for every iteration. */ 00129 00130 /* Set the output pointer to point to the firstIndex 00131 * of the output sample to be calculated. */ 00132 pOut = pDst + firstIndex; 00133 00134 /* -------------------------- 00135 * Initializations of stage1 00136 * -------------------------*/ 00137 00138 /* sum = x[0] * y[0] 00139 * sum = x[0] * y[1] + x[1] * y[0] 00140 * .... 00141 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0] 00142 */ 00143 00144 /* In this stage the MAC operations are increased by 1 for every iteration. 00145 The count variable holds the number of MAC operations performed. 00146 Since the partial convolution starts from firstIndex 00147 Number of Macs to be performed is firstIndex + 1 */ 00148 count = 1u + firstIndex; 00149 00150 /* Working pointer of inputA */ 00151 px = pIn1; 00152 00153 /* Working pointer of inputB */ 00154 pSrc2 = pIn2 + firstIndex; 00155 py = pSrc2; 00156 00157 /* ------------------------ 00158 * Stage1 process 00159 * ----------------------*/ 00160 00161 /* The first loop starts here */ 00162 while(blockSize1 > 0) 00163 { 00164 /* Accumulator is made zero for every iteration */ 00165 sum = 0; 00166 00167 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00168 k = count >> 2u; 00169 00170 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00171 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00172 while(k > 0u) 00173 { 00174 /* x[0] * y[srcBLen - 1] */ 00175 sum = (q31_t) ((((q63_t) sum << 32) + 00176 ((q63_t) * px++ * (*py--))) >> 32); 00177 00178 /* x[1] * y[srcBLen - 2] */ 00179 sum = (q31_t) ((((q63_t) sum << 32) + 00180 ((q63_t) * px++ * (*py--))) >> 32); 00181 00182 /* x[2] * y[srcBLen - 3] */ 00183 sum = (q31_t) ((((q63_t) sum << 32) + 00184 ((q63_t) * px++ * (*py--))) >> 32); 00185 00186 /* x[3] * y[srcBLen - 4] */ 00187 sum = (q31_t) ((((q63_t) sum << 32) + 00188 ((q63_t) * px++ * (*py--))) >> 32); 00189 00190 /* Decrement the loop counter */ 00191 k--; 00192 } 00193 00194 /* If the count is not a multiple of 4, compute any remaining MACs here. 00195 ** No loop unrolling is used. */ 00196 k = count % 0x4u; 00197 00198 while(k > 0u) 00199 { 00200 /* Perform the multiply-accumulates */ 00201 sum = (q31_t) ((((q63_t) sum << 32) + 00202 ((q63_t) * px++ * (*py--))) >> 32); 00203 00204 /* Decrement the loop counter */ 00205 k--; 00206 } 00207 00208 /* Store the result in the accumulator in the destination buffer. */ 00209 *pOut++ = sum << 1; 00210 00211 /* Update the inputA and inputB pointers for next MAC calculation */ 00212 py = ++pSrc2; 00213 px = pIn1; 00214 00215 /* Increment the MAC count */ 00216 count++; 00217 00218 /* Decrement the loop counter */ 00219 blockSize1--; 00220 } 00221 00222 /* -------------------------- 00223 * Initializations of stage2 00224 * ------------------------*/ 00225 00226 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0] 00227 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0] 00228 * .... 00229 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0] 00230 */ 00231 00232 /* Working pointer of inputA */ 00233 px = pIn1; 00234 00235 /* Working pointer of inputB */ 00236 pSrc2 = pIn2 + (srcBLen - 1u); 00237 py = pSrc2; 00238 00239 /* count is index by which the pointer pIn1 to be incremented */ 00240 count = 1u; 00241 00242 /* ------------------- 00243 * Stage2 process 00244 * ------------------*/ 00245 00246 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed. 00247 * So, to loop unroll over blockSize2, 00248 * srcBLen should be greater than or equal to 4 */ 00249 if(srcBLen >= 4u) 00250 { 00251 /* Loop unroll over blockSize2 */ 00252 blkCnt = ((uint32_t) blockSize2 >> 2u); 00253 00254 while(blkCnt > 0u) 00255 { 00256 /* Set all accumulators to zero */ 00257 acc0 = 0; 00258 acc1 = 0; 00259 acc2 = 0; 00260 acc3 = 0; 00261 00262 /* read x[0], x[1], x[2] samples */ 00263 x0 = *(px++); 00264 x1 = *(px++); 00265 x2 = *(px++); 00266 00267 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00268 k = srcBLen >> 2u; 00269 00270 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00271 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00272 do 00273 { 00274 /* Read y[srcBLen - 1] sample */ 00275 c0 = *(py--); 00276 00277 /* Read x[3] sample */ 00278 x3 = *(px++); 00279 00280 /* Perform the multiply-accumulate */ 00281 /* acc0 += x[0] * y[srcBLen - 1] */ 00282 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00283 00284 /* acc1 += x[1] * y[srcBLen - 1] */ 00285 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00286 00287 /* acc2 += x[2] * y[srcBLen - 1] */ 00288 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00289 00290 /* acc3 += x[3] * y[srcBLen - 1] */ 00291 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00292 00293 /* Read y[srcBLen - 2] sample */ 00294 c0 = *(py--); 00295 00296 /* Read x[4] sample */ 00297 x0 = *(px++); 00298 00299 /* Perform the multiply-accumulate */ 00300 /* acc0 += x[1] * y[srcBLen - 2] */ 00301 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32); 00302 /* acc1 += x[2] * y[srcBLen - 2] */ 00303 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32); 00304 /* acc2 += x[3] * y[srcBLen - 2] */ 00305 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32); 00306 /* acc3 += x[4] * y[srcBLen - 2] */ 00307 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32); 00308 00309 /* Read y[srcBLen - 3] sample */ 00310 c0 = *(py--); 00311 00312 /* Read x[5] sample */ 00313 x1 = *(px++); 00314 00315 /* Perform the multiply-accumulates */ 00316 /* acc0 += x[2] * y[srcBLen - 3] */ 00317 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32); 00318 /* acc1 += x[3] * y[srcBLen - 2] */ 00319 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32); 00320 /* acc2 += x[4] * y[srcBLen - 2] */ 00321 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32); 00322 /* acc3 += x[5] * y[srcBLen - 2] */ 00323 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32); 00324 00325 /* Read y[srcBLen - 4] sample */ 00326 c0 = *(py--); 00327 00328 /* Read x[6] sample */ 00329 x2 = *(px++); 00330 00331 /* Perform the multiply-accumulates */ 00332 /* acc0 += x[3] * y[srcBLen - 4] */ 00333 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32); 00334 /* acc1 += x[4] * y[srcBLen - 4] */ 00335 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32); 00336 /* acc2 += x[5] * y[srcBLen - 4] */ 00337 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32); 00338 /* acc3 += x[6] * y[srcBLen - 4] */ 00339 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32); 00340 00341 00342 } while(--k); 00343 00344 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00345 ** No loop unrolling is used. */ 00346 k = srcBLen % 0x4u; 00347 00348 while(k > 0u) 00349 { 00350 /* Read y[srcBLen - 5] sample */ 00351 c0 = *(py--); 00352 00353 /* Read x[7] sample */ 00354 x3 = *(px++); 00355 00356 /* Perform the multiply-accumulates */ 00357 /* acc0 += x[4] * y[srcBLen - 5] */ 00358 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32); 00359 /* acc1 += x[5] * y[srcBLen - 5] */ 00360 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32); 00361 /* acc2 += x[6] * y[srcBLen - 5] */ 00362 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32); 00363 /* acc3 += x[7] * y[srcBLen - 5] */ 00364 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32); 00365 00366 /* Reuse the present samples for the next MAC */ 00367 x0 = x1; 00368 x1 = x2; 00369 x2 = x3; 00370 00371 /* Decrement the loop counter */ 00372 k--; 00373 } 00374 00375 /* Store the result in the accumulator in the destination buffer. */ 00376 *pOut++ = (q31_t) (acc0 << 1); 00377 *pOut++ = (q31_t) (acc1 << 1); 00378 *pOut++ = (q31_t) (acc2 << 1); 00379 *pOut++ = (q31_t) (acc3 << 1); 00380 00381 /* Update the inputA and inputB pointers for next MAC calculation */ 00382 px = pIn1 + (count * 4u); 00383 py = pSrc2; 00384 00385 /* Increment the pointer pIn1 index, count by 1 */ 00386 count++; 00387 00388 /* Decrement the loop counter */ 00389 blkCnt--; 00390 } 00391 00392 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here. 00393 ** No loop unrolling is used. */ 00394 blkCnt = (uint32_t) blockSize2 % 0x4u; 00395 00396 while(blkCnt > 0u) 00397 { 00398 /* Accumulator is made zero for every iteration */ 00399 sum = 0; 00400 00401 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00402 k = srcBLen >> 2u; 00403 00404 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00405 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00406 while(k > 0u) 00407 { 00408 /* Perform the multiply-accumulates */ 00409 sum = (q31_t) ((((q63_t) sum << 32) + 00410 ((q63_t) * px++ * (*py--))) >> 32); 00411 sum = (q31_t) ((((q63_t) sum << 32) + 00412 ((q63_t) * px++ * (*py--))) >> 32); 00413 sum = (q31_t) ((((q63_t) sum << 32) + 00414 ((q63_t) * px++ * (*py--))) >> 32); 00415 sum = (q31_t) ((((q63_t) sum << 32) + 00416 ((q63_t) * px++ * (*py--))) >> 32); 00417 00418 /* Decrement the loop counter */ 00419 k--; 00420 } 00421 00422 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here. 00423 ** No loop unrolling is used. */ 00424 k = srcBLen % 0x4u; 00425 00426 while(k > 0u) 00427 { 00428 /* Perform the multiply-accumulate */ 00429 sum = (q31_t) ((((q63_t) sum << 32) + 00430 ((q63_t) * px++ * (*py--))) >> 32); 00431 00432 /* Decrement the loop counter */ 00433 k--; 00434 } 00435 00436 /* Store the result in the accumulator in the destination buffer. */ 00437 *pOut++ = sum << 1; 00438 00439 /* Update the inputA and inputB pointers for next MAC calculation */ 00440 px = pIn1 + count; 00441 py = pSrc2; 00442 00443 /* Increment the MAC count */ 00444 count++; 00445 00446 /* Decrement the loop counter */ 00447 blkCnt--; 00448 } 00449 } 00450 else 00451 { 00452 /* If the srcBLen is not a multiple of 4, 00453 * the blockSize2 loop cannot be unrolled by 4 */ 00454 blkCnt = (uint32_t) blockSize2; 00455 00456 while(blkCnt > 0u) 00457 { 00458 /* Accumulator is made zero for every iteration */ 00459 sum = 0; 00460 00461 /* srcBLen number of MACS should be performed */ 00462 k = srcBLen; 00463 00464 while(k > 0u) 00465 { 00466 /* Perform the multiply-accumulate */ 00467 sum = (q31_t) ((((q63_t) sum << 32) + 00468 ((q63_t) * px++ * (*py--))) >> 32); 00469 00470 /* Decrement the loop counter */ 00471 k--; 00472 } 00473 00474 /* Store the result in the accumulator in the destination buffer. */ 00475 *pOut++ = sum << 1; 00476 00477 /* Update the inputA and inputB pointers for next MAC calculation */ 00478 px = pIn1 + count; 00479 py = pSrc2; 00480 00481 /* Increment the MAC count */ 00482 count++; 00483 00484 /* Decrement the loop counter */ 00485 blkCnt--; 00486 } 00487 } 00488 00489 00490 /* -------------------------- 00491 * Initializations of stage3 00492 * -------------------------*/ 00493 00494 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1] 00495 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2] 00496 * .... 00497 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2] 00498 * sum += x[srcALen-1] * y[srcBLen-1] 00499 */ 00500 00501 /* In this stage the MAC operations are decreased by 1 for every iteration. 00502 The count variable holds the number of MAC operations performed */ 00503 count = srcBLen - 1u; 00504 00505 /* Working pointer of inputA */ 00506 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u); 00507 px = pSrc1; 00508 00509 /* Working pointer of inputB */ 00510 pSrc2 = pIn2 + (srcBLen - 1u); 00511 py = pSrc2; 00512 00513 /* ------------------- 00514 * Stage3 process 00515 * ------------------*/ 00516 00517 while(blockSize3 > 0) 00518 { 00519 /* Accumulator is made zero for every iteration */ 00520 sum = 0; 00521 00522 /* Apply loop unrolling and compute 4 MACs simultaneously. */ 00523 k = count >> 2u; 00524 00525 /* First part of the processing with loop unrolling. Compute 4 MACs at a time. 00526 ** a second loop below computes MACs for the remaining 1 to 3 samples. */ 00527 while(k > 0u) 00528 { 00529 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */ 00530 sum = (q31_t) ((((q63_t) sum << 32) + 00531 ((q63_t) * px++ * (*py--))) >> 32); 00532 00533 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */ 00534 sum = (q31_t) ((((q63_t) sum << 32) + 00535 ((q63_t) * px++ * (*py--))) >> 32); 00536 00537 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */ 00538 sum = (q31_t) ((((q63_t) sum << 32) + 00539 ((q63_t) * px++ * (*py--))) >> 32); 00540 00541 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */ 00542 sum = (q31_t) ((((q63_t) sum << 32) + 00543 ((q63_t) * px++ * (*py--))) >> 32); 00544 00545 /* Decrement the loop counter */ 00546 k--; 00547 } 00548 00549 /* If the count is not a multiple of 4, compute any remaining MACs here. 00550 ** No loop unrolling is used. */ 00551 k = count % 0x4u; 00552 00553 while(k > 0u) 00554 { 00555 /* Perform the multiply-accumulates */ 00556 /* sum += x[srcALen-1] * y[srcBLen-1] */ 00557 sum = (q31_t) ((((q63_t) sum << 32) + 00558 ((q63_t) * px++ * (*py--))) >> 32); 00559 00560 /* Decrement the loop counter */ 00561 k--; 00562 } 00563 00564 /* Store the result in the accumulator in the destination buffer. */ 00565 *pOut++ = sum << 1; 00566 00567 /* Update the inputA and inputB pointers for next MAC calculation */ 00568 px = ++pSrc1; 00569 py = pSrc2; 00570 00571 /* Decrement the MAC count */ 00572 count--; 00573 00574 /* Decrement the loop counter */ 00575 blockSize3--; 00576 00577 } 00578 00579 /* set status as ARM_MATH_SUCCESS */ 00580 status = ARM_MATH_SUCCESS; 00581 } 00582 00583 /* Return to application */ 00584 return (status); 00585 00586 } 00587  All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Defines Generated on Mon Nov 29 2010 17:19:56 for CMSIS DSP Software Library by  1.7.2

Wyszukiwarka

Podobne podstrony:
arm conv partial ?st q15? source
arm conv partial ?st q31?
arm conv partial ?st q15?
arm mat mult ?st q31? source
arm fir ?cimate ?st q31? source
arm conv partial q31? source
arm conv ?st q31? source
arm biquad ?scade ?1 ?st q31? source
arm conv partial q7? source
arm conv partial q15? source
arm correlate ?st q31? source
arm conv partial q31?
arm fir ?st q31? source
arm conv partial ?2? source
arm fir lattice init q31? source
arm fir ?cimate ?st q15? source
arm cmplx dot prod q31? source
arm conv partial q15?
arm fir ?cimate init q31? source

więcej podobnych podstron