init

2023-10-03 16:40:14 +08:00
commit df1cef55ca
1022 changed files with 487241 additions and 0 deletions
@@ -0,0 +1,170 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_max_f32.c
+ * Description:  Maximum value of a floating-point vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @defgroup Max Maximum
+ *
+ * Computes the maximum value of an array of data.
+ * The function returns both the maximum value and its position within the array.
+ * There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ * @addtogroup Max
+ * @{
+ */
+
+
+/**
+ * @brief Maximum value of a floating-point vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult maximum value returned here
+ * @param[out]      *pIndex index of maximum value returned here
+ * @return none.
+ */
+
+void arm_max_f32(
+  float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult,
+  uint32_t * pIndex)
+{
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  float32_t maxVal1, maxVal2, out;               /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex, count;              /* loop counter */
+
+  /* Initialise the count value. */
+  count = 0U;
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  /* Loop unrolling */
+  blkCnt = (blockSize - 1U) >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize maxVal to the next consecutive values one by one */
+    maxVal1 = *pSrc++;
+    maxVal2 = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal1)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal1;
+      outIndex = count + 1U;
+    }
+
+    /* compare for the maximum value */
+    if (out < maxVal2)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal2;
+      outIndex = count + 2U;
+    }
+
+    /* Initialize maxVal to the next consecutive values one by one */
+    maxVal1 = *pSrc++;
+    maxVal2 = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal1)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal1;
+      outIndex = count + 3U;
+    }
+
+    /* compare for the maximum value */
+    if (out < maxVal2)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal2;
+      outIndex = count + 4U;
+    }
+
+    count += 4U;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* if (blockSize - 1U) is not multiple of 4 */
+  blkCnt = (blockSize - 1U) % 4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  float32_t maxVal1, out;                        /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex;                     /* loop counter */
+
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize maxVal to the next consecutive values one by one */
+    maxVal1 = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal1)
+    {
+      /* Update the maximum value and it's index */
+      out = maxVal1;
+      outIndex = blockSize - blkCnt;
+    }
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Store the maximum value and it's index into destination pointers */
+  *pResult = out;
+  *pIndex = outIndex;
+}
+
+/**
+ * @} end of Max group
+ */
@@ -0,0 +1,162 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_max_q15.c
+ * Description:  Maximum value of a Q15 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup Max
+ * @{
+ */
+
+
+/**
+ * @brief Maximum value of a Q15 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult maximum value returned here
+ * @param[out]      *pIndex index of maximum value returned here
+ * @return none.
+ */
+
+void arm_max_q15(
+  q15_t * pSrc,
+  uint32_t blockSize,
+  q15_t * pResult,
+  uint32_t * pIndex)
+{
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  q15_t maxVal1, maxVal2, out;                   /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex, count;              /* loop counter */
+
+  /* Initialise the count value. */
+  count = 0U;
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  /* Loop unrolling */
+  blkCnt = (blockSize - 1U) >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize maxVal to the next consecutive values one by one */
+    maxVal1 = *pSrc++;
+    maxVal2 = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal1)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal1;
+      outIndex = count + 1U;
+    }
+
+    /* compare for the maximum value */
+    if (out < maxVal2)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal2;
+      outIndex = count + 2U;
+    }
+
+    /* Initialize maxVal to the next consecutive values one by one */
+    maxVal1 = *pSrc++;
+    maxVal2 = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal1)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal1;
+      outIndex = count + 3U;
+    }
+
+    /* compare for the maximum value */
+    if (out < maxVal2)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal2;
+      outIndex = count + 4U;
+    }
+
+    count += 4U;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* if (blockSize - 1U) is not multiple of 4 */
+  blkCnt = (blockSize - 1U) % 4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  q15_t maxVal1, out;                            /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex;                     /* loop counter */
+
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize maxVal to the next consecutive values one by one */
+    maxVal1 = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal1)
+    {
+      /* Update the maximum value and it's index */
+      out = maxVal1;
+      outIndex = blockSize - blkCnt;
+    }
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Store the maximum value and it's index into destination pointers */
+  *pResult = out;
+  *pIndex = outIndex;
+}
+
+/**
+ * @} end of Max group
+ */
@@ -0,0 +1,162 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_max_q31.c
+ * Description:  Maximum value of a Q31 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup Max
+ * @{
+ */
+
+
+/**
+ * @brief Maximum value of a Q31 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult maximum value returned here
+ * @param[out]      *pIndex index of maximum value returned here
+ * @return none.
+ */
+
+void arm_max_q31(
+  q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult,
+  uint32_t * pIndex)
+{
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  q31_t maxVal1, maxVal2, out;                   /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex, count;              /* loop counter */
+
+  /* Initialise the count value. */
+  count = 0U;
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  /* Loop unrolling */
+  blkCnt = (blockSize - 1U) >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize maxVal to the next consecutive values one by one */
+    maxVal1 = *pSrc++;
+    maxVal2 = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal1)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal1;
+      outIndex = count + 1U;
+    }
+
+    /* compare for the maximum value */
+    if (out < maxVal2)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal2;
+      outIndex = count + 2U;
+    }
+
+    /* Initialize maxVal to the next consecutive values one by one */
+    maxVal1 = *pSrc++;
+    maxVal2 = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal1)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal1;
+      outIndex = count + 3U;
+    }
+
+    /* compare for the maximum value */
+    if (out < maxVal2)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal2;
+      outIndex = count + 4U;
+    }
+
+    count += 4U;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* if (blockSize - 1U) is not multiple of 4 */
+  blkCnt = (blockSize - 1U) % 4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  q31_t maxVal1, out;                            /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex;                     /* loop counter */
+
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize maxVal to the next consecutive values one by one */
+    maxVal1 = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal1)
+    {
+      /* Update the maximum value and it's index */
+      out = maxVal1;
+      outIndex = blockSize - blkCnt;
+    }
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Store the maximum value and it's index into destination pointers */
+  *pResult = out;
+  *pIndex = outIndex;
+}
+
+/**
+ * @} end of Max group
+ */
@@ -0,0 +1,162 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_max_q7.c
+ * Description:  Maximum value of a Q7 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup Max
+ * @{
+ */
+
+
+/**
+ * @brief Maximum value of a Q7 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult maximum value returned here
+ * @param[out]      *pIndex index of maximum value returned here
+ * @return none.
+ */
+
+void arm_max_q7(
+  q7_t * pSrc,
+  uint32_t blockSize,
+  q7_t * pResult,
+  uint32_t * pIndex)
+{
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  q7_t maxVal1, maxVal2, out;                    /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex, count;              /* loop counter */
+
+  /* Initialise the count value. */
+  count = 0U;
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  /* Loop unrolling */
+  blkCnt = (blockSize - 1U) >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize maxVal to the next consecutive values one by one */
+    maxVal1 = *pSrc++;
+    maxVal2 = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal1)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal1;
+      outIndex = count + 1U;
+    }
+
+    /* compare for the maximum value */
+    if (out < maxVal2)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal2;
+      outIndex = count + 2U;
+    }
+
+    /* Initialize maxVal to the next consecutive values one by one */
+    maxVal1 = *pSrc++;
+    maxVal2 = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal1)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal1;
+      outIndex = count + 3U;
+    }
+
+    /* compare for the maximum value */
+    if (out < maxVal2)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal2;
+      outIndex = count + 4U;
+    }
+
+    count += 4U;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* if (blockSize - 1U) is not multiple of 4 */
+  blkCnt = (blockSize - 1U) % 4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  q7_t maxVal1, out;                             /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex;                     /* loop counter */
+
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize maxVal to the next consecutive values one by one */
+    maxVal1 = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal1)
+    {
+      /* Update the maximum value and it's index */
+      out = maxVal1;
+      outIndex = blockSize - blkCnt;
+    }
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Store the maximum value and it's index into destination pointers */
+  *pResult = out;
+  *pIndex = outIndex;
+}
+
+/**
+ * @} end of Max group
+ */
@@ -0,0 +1,125 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mean_f32.c
+ * Description:  Mean value of a floating-point vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @defgroup mean Mean
+ *
+ * Calculates the mean of the input vector. Mean is defined as the average of the elements in the vector.
+ * The underlying algorithm is used:
+ *
+ * <pre>
+ * 	Result = (pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]) / blockSize;
+ * </pre>
+ *
+ * There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ * @addtogroup mean
+ * @{
+ */
+
+
+/**
+ * @brief Mean value of a floating-point vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult mean value returned here
+ * @return none.
+ */
+
+void arm_mean_f32(
+  float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+  float32_t sum = 0.0f;                          /* Temporary result storage */
+  uint32_t blkCnt;                               /* loop counter */
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  float32_t in1, in2, in3, in4;
+
+  /*loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    in1 = *pSrc++;
+    in2 = *pSrc++;
+    in3 = *pSrc++;
+    in4 = *pSrc++;
+
+    sum += in1;
+    sum += in2;
+    sum += in3;
+    sum += in4;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize  */
+  /* Store the result to the destination */
+  *pResult = sum / (float32_t) blockSize;
+}
+
+/**
+ * @} end of mean group
+ */
@@ -0,0 +1,120 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mean_q15.c
+ * Description:  Mean value of a Q15 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup mean
+ * @{
+ */
+
+
+/**
+ * @brief Mean value of a Q15 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult mean value returned here
+ * @return none.
+ *
+ * @details
+ * <b>Scaling and Overflow Behavior:</b>
+ * \par
+ * The function is implemented using a 32-bit internal accumulator.
+ * The input is represented in 1.15 format and is accumulated in a 32-bit
+ * accumulator in 17.15 format.
+ * There is no risk of internal overflow with this approach, and the
+ * full precision of intermediate result is preserved.
+ * Finally, the accumulator is saturated and truncated to yield a result of 1.15 format.
+ *
+ */
+
+void arm_mean_q15(
+  q15_t * pSrc,
+  uint32_t blockSize,
+  q15_t * pResult)
+{
+  q31_t sum = 0;                                 /* Temporary result storage */
+  uint32_t blkCnt;                               /* loop counter */
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  q31_t in;
+
+  /*loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    in = *__SIMD32(pSrc)++;
+    sum += ((in << 16U) >> 16U);
+    sum +=  (in >> 16U);
+    in = *__SIMD32(pSrc)++;
+    sum += ((in << 16U) >> 16U);
+    sum +=  (in >> 16U);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize  */
+  /* Store the result to the destination */
+  *pResult = (q15_t) (sum / (q31_t)blockSize);
+}
+
+/**
+ * @} end of mean group
+ */
@@ -0,0 +1,123 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mean_q31.c
+ * Description:  Mean value of a Q31 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup mean
+ * @{
+ */
+
+
+/**
+ * @brief Mean value of a Q31 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult mean value returned here
+ * @return none.
+ *
+ * @details
+ * <b>Scaling and Overflow Behavior:</b>
+ *\par
+ * The function is implemented using a 64-bit internal accumulator.
+ * The input is represented in 1.31 format and is accumulated in a 64-bit
+ * accumulator in 33.31 format.
+ * There is no risk of internal overflow with this approach, and the
+ * full precision of intermediate result is preserved.
+ * Finally, the accumulator is truncated to yield a result of 1.31 format.
+ *
+ */
+
+void arm_mean_q31(
+  q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult)
+{
+  q63_t sum = 0;                                 /* Temporary result storage */
+  uint32_t blkCnt;                               /* loop counter */
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  q31_t in1, in2, in3, in4;
+
+  /*loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    in1 = *pSrc++;
+    in2 = *pSrc++;
+    in3 = *pSrc++;
+    in4 = *pSrc++;
+
+    sum += in1;
+    sum += in2;
+    sum += in3;
+    sum += in4;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize  */
+  /* Store the result to the destination */
+  *pResult = (q31_t) (sum / (int32_t) blockSize);
+}
+
+/**
+ * @} end of mean group
+ */
@@ -0,0 +1,120 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_mean_q7.c
+ * Description:  Mean value of a Q7 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup mean
+ * @{
+ */
+
+
+/**
+ * @brief Mean value of a Q7 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult mean value returned here
+ * @return none.
+ *
+ * @details
+ * <b>Scaling and Overflow Behavior:</b>
+ * \par
+ * The function is implemented using a 32-bit internal accumulator.
+ * The input is represented in 1.7 format and is accumulated in a 32-bit
+ * accumulator in 25.7 format.
+ * There is no risk of internal overflow with this approach, and the
+ * full precision of intermediate result is preserved.
+ * Finally, the accumulator is truncated to yield a result of 1.7 format.
+ *
+ */
+
+void arm_mean_q7(
+  q7_t * pSrc,
+  uint32_t blockSize,
+  q7_t * pResult)
+{
+  q31_t sum = 0;                                 /* Temporary result storage */
+  uint32_t blkCnt;                               /* loop counter */
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  q31_t in;
+
+  /*loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    in = *__SIMD32(pSrc)++;
+
+    sum += ((in << 24U) >> 24U);
+    sum += ((in << 16U) >> 24U);
+    sum += ((in <<  8U) >> 24U);
+    sum +=  (in >> 24U);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize  */
+  /* Store the result to the destination */
+  *pResult = (q7_t) (sum / (int32_t) blockSize);
+}
+
+/**
+ * @} end of mean group
+ */
@@ -0,0 +1,170 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_min_f32.c
+ * Description:  Minimum value of a floating-point vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @defgroup Min Minimum
+ *
+ * Computes the minimum value of an array of data.
+ * The function returns both the minimum value and its position within the array.
+ * There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ * @addtogroup Min
+ * @{
+ */
+
+
+/**
+ * @brief Minimum value of a floating-point vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult minimum value returned here
+ * @param[out]      *pIndex index of minimum value returned here
+ * @return none.
+ */
+
+void arm_min_f32(
+  float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult,
+  uint32_t * pIndex)
+{
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  float32_t minVal1, minVal2, out;               /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex, count;              /* loop counter */
+
+  /* Initialise the count value. */
+  count = 0U;
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  /* Loop unrolling */
+  blkCnt = (blockSize - 1U) >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize minVal to the next consecutive values one by one */
+    minVal1 = *pSrc++;
+    minVal2 = *pSrc++;
+
+    /* compare for the minimum value */
+    if (out > minVal1)
+    {
+      /* Update the minimum value and its index */
+      out = minVal1;
+      outIndex = count + 1U;
+    }
+
+    /* compare for the minimum value */
+    if (out > minVal2)
+    {
+      /* Update the minimum value and its index */
+      out = minVal2;
+      outIndex = count + 2U;
+    }
+
+    /* Initialize minVal to the next consecutive values one by one */
+    minVal1 = *pSrc++;
+    minVal2 = *pSrc++;
+
+    /* compare for the minimum value */
+    if (out > minVal1)
+    {
+      /* Update the minimum value and its index */
+      out = minVal1;
+      outIndex = count + 3U;
+    }
+
+    /* compare for the minimum value */
+    if (out > minVal2)
+    {
+      /* Update the minimum value and its index */
+      out = minVal2;
+      outIndex = count + 4U;
+    }
+
+    count += 4U;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* if (blockSize - 1U) is not multiple of 4 */
+  blkCnt = (blockSize - 1U) % 4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  float32_t minVal1, out;                        /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex;                     /* loop counter */
+
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize minVal to the next consecutive values one by one */
+    minVal1 = *pSrc++;
+
+    /* compare for the minimum value */
+    if (out > minVal1)
+    {
+      /* Update the minimum value and it's index */
+      out = minVal1;
+      outIndex = blockSize - blkCnt;
+    }
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Store the minimum value and it's index into destination pointers */
+  *pResult = out;
+  *pIndex = outIndex;
+}
+
+/**
+ * @} end of Min group
+ */
@@ -0,0 +1,163 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_min_q15.c
+ * Description:  Minimum value of a Q15 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+
+/**
+ * @addtogroup Min
+ * @{
+ */
+
+
+/**
+ * @brief Minimum value of a Q15 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult minimum value returned here
+ * @param[out]      *pIndex index of minimum value returned here
+ * @return none.
+ */
+
+void arm_min_q15(
+  q15_t * pSrc,
+  uint32_t blockSize,
+  q15_t * pResult,
+  uint32_t * pIndex)
+{
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  q15_t minVal1, minVal2, out;                   /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex, count;              /* loop counter */
+
+  /* Initialise the count value. */
+  count = 0U;
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  /* Loop unrolling */
+  blkCnt = (blockSize - 1U) >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize minVal to the next consecutive values one by one */
+    minVal1 = *pSrc++;
+    minVal2 = *pSrc++;
+
+    /* compare for the minimum value */
+    if (out > minVal1)
+    {
+      /* Update the minimum value and its index */
+      out = minVal1;
+      outIndex = count + 1U;
+    }
+
+    /* compare for the minimum value */
+    if (out > minVal2)
+    {
+      /* Update the minimum value and its index */
+      out = minVal2;
+      outIndex = count + 2U;
+    }
+
+    /* Initialize minVal to the next consecutive values one by one */
+    minVal1 = *pSrc++;
+    minVal2 = *pSrc++;
+
+    /* compare for the minimum value */
+    if (out > minVal1)
+    {
+      /* Update the minimum value and its index */
+      out = minVal1;
+      outIndex = count + 3U;
+    }
+
+    /* compare for the minimum value */
+    if (out > minVal2)
+    {
+      /* Update the minimum value and its index */
+      out = minVal2;
+      outIndex = count + 4U;
+    }
+
+    count += 4U;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* if (blockSize - 1U) is not multiple of 4 */
+  blkCnt = (blockSize - 1U) % 4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  q15_t minVal1, out;                            /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex;                     /* loop counter */
+
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize minVal to the next consecutive values one by one */
+    minVal1 = *pSrc++;
+
+    /* compare for the minimum value */
+    if (out > minVal1)
+    {
+      /* Update the minimum value and it's index */
+      out = minVal1;
+      outIndex = blockSize - blkCnt;
+    }
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Store the minimum value and it's index into destination pointers */
+  *pResult = out;
+  *pIndex = outIndex;
+}
+
+/**
+ * @} end of Min group
+ */
@@ -0,0 +1,163 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_min_q31.c
+ * Description:  Minimum value of a Q31 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+
+/**
+ * @addtogroup Min
+ * @{
+ */
+
+
+/**
+ * @brief Minimum value of a Q31 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult minimum value returned here
+ * @param[out]      *pIndex index of minimum value returned here
+ * @return none.
+ */
+
+void arm_min_q31(
+  q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult,
+  uint32_t * pIndex)
+{
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  q31_t minVal1, minVal2, out;                   /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex, count;              /* loop counter */
+
+  /* Initialise the count value. */
+  count = 0U;
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  /* Loop unrolling */
+  blkCnt = (blockSize - 1U) >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize minVal to the next consecutive values one by one */
+    minVal1 = *pSrc++;
+    minVal2 = *pSrc++;
+
+    /* compare for the minimum value */
+    if (out > minVal1)
+    {
+      /* Update the minimum value and its index */
+      out = minVal1;
+      outIndex = count + 1U;
+    }
+
+    /* compare for the minimum value */
+    if (out > minVal2)
+    {
+      /* Update the minimum value and its index */
+      out = minVal2;
+      outIndex = count + 2U;
+    }
+
+    /* Initialize minVal to the next consecutive values one by one */
+    minVal1 = *pSrc++;
+    minVal2 = *pSrc++;
+
+    /* compare for the minimum value */
+    if (out > minVal1)
+    {
+      /* Update the minimum value and its index */
+      out = minVal1;
+      outIndex = count + 3U;
+    }
+
+    /* compare for the minimum value */
+    if (out > minVal2)
+    {
+      /* Update the minimum value and its index */
+      out = minVal2;
+      outIndex = count + 4U;
+    }
+
+    count += 4U;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* if (blockSize - 1U) is not multiple of 4 */
+  blkCnt = (blockSize - 1U) % 4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  q31_t minVal1, out;                            /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex;                     /* loop counter */
+
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize minVal to the next consecutive values one by one */
+    minVal1 = *pSrc++;
+
+    /* compare for the minimum value */
+    if (out > minVal1)
+    {
+      /* Update the minimum value and it's index */
+      out = minVal1;
+      outIndex = blockSize - blkCnt;
+    }
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Store the minimum value and it's index into destination pointers */
+  *pResult = out;
+  *pIndex = outIndex;
+}
+
+/**
+ * @} end of Min group
+ */
@@ -0,0 +1,163 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_min_q7.c
+ * Description:  Minimum value of a Q7 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+
+/**
+ * @addtogroup Min
+ * @{
+ */
+
+
+/**
+ * @brief Minimum value of a Q7 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult minimum value returned here
+ * @param[out]      *pIndex index of minimum value returned here
+ * @return none.
+ */
+
+void arm_min_q7(
+  q7_t * pSrc,
+  uint32_t blockSize,
+  q7_t * pResult,
+  uint32_t * pIndex)
+{
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  q7_t minVal1, minVal2, out;                    /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex, count;              /* loop counter */
+
+  /* Initialise the count value. */
+  count = 0U;
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  /* Loop unrolling */
+  blkCnt = (blockSize - 1U) >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize minVal to the next consecutive values one by one */
+    minVal1 = *pSrc++;
+    minVal2 = *pSrc++;
+
+    /* compare for the minimum value */
+    if (out > minVal1)
+    {
+      /* Update the minimum value and its index */
+      out = minVal1;
+      outIndex = count + 1U;
+    }
+
+    /* compare for the minimum value */
+    if (out > minVal2)
+    {
+      /* Update the minimum value and its index */
+      out = minVal2;
+      outIndex = count + 2U;
+    }
+
+    /* Initialize minVal to the next consecutive values one by one */
+    minVal1 = *pSrc++;
+    minVal2 = *pSrc++;
+
+    /* compare for the minimum value */
+    if (out > minVal1)
+    {
+      /* Update the minimum value and its index */
+      out = minVal1;
+      outIndex = count + 3U;
+    }
+
+    /* compare for the minimum value */
+    if (out > minVal2)
+    {
+      /* Update the minimum value and its index */
+      out = minVal2;
+      outIndex = count + 4U;
+    }
+
+    count += 4U;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* if (blockSize - 1U) is not multiple of 4 */
+  blkCnt = (blockSize - 1U) % 4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  q7_t minVal1, out;                             /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex;                     /* loop counter */
+
+  /* Initialise the index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that act as reference value for comparision */
+  out = *pSrc++;
+
+  blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize minVal to the next consecutive values one by one */
+    minVal1 = *pSrc++;
+
+    /* compare for the minimum value */
+    if (out > minVal1)
+    {
+      /* Update the minimum value and it's index */
+      out = minVal1;
+      outIndex = blockSize - blkCnt;
+    }
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Store the minimum value and it's index into destination pointers */
+  *pResult = out;
+  *pIndex = outIndex;
+}
+
+/**
+ * @} end of Min group
+ */
@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_power_f32.c
+ * Description:  Sum of the squares of the elements of a floating-point vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @defgroup power Power
+ *
+ * Calculates the sum of the squares of the elements in the input vector.
+ * The underlying algorithm is used:
+ *
+ * <pre>
+ * 	Result = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + pSrc[2] * pSrc[2] + ... + pSrc[blockSize-1] * pSrc[blockSize-1];
+ * </pre>
+ *
+ * There are separate functions for floating point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ * @addtogroup power
+ * @{
+ */
+
+
+/**
+ * @brief Sum of the squares of the elements of a floating-point vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult sum of the squares value returned here
+ * @return none.
+ *
+ */
+
+
+void arm_power_f32(
+  float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+  float32_t sum = 0.0f;                          /* accumulator */
+  float32_t in;                                  /* Temporary variable to store input value */
+  uint32_t blkCnt;                               /* loop counter */
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  /*loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* Compute Power and then store the result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += in * in;
+    in = *pSrc++;
+    sum += in * in;
+    in = *pSrc++;
+    sum += in * in;
+    in = *pSrc++;
+    sum += in * in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* compute power and then store the result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += in * in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Store the result to the destination */
+  *pResult = sum;
+}
+
+/**
+ * @} end of power group
+ */
@@ -0,0 +1,138 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_power_q15.c
+ * Description:  Sum of the squares of the elements of a Q15 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup power
+ * @{
+ */
+
+/**
+ * @brief Sum of the squares of the elements of a Q15 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult sum of the squares value returned here
+ * @return none.
+ *
+ * @details
+ * <b>Scaling and Overflow Behavior:</b>
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * The input is represented in 1.15 format.
+ * Intermediate multiplication yields a 2.30 format, and this
+ * result is added without saturation to a 64-bit accumulator in 34.30 format.
+ * With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ * full precision of the intermediate multiplication is preserved.
+ * Finally, the return result is in 34.30 format.
+ *
+ */
+
+void arm_power_q15(
+  q15_t * pSrc,
+  uint32_t blockSize,
+  q63_t * pResult)
+{
+  q63_t sum = 0;                                 /* Temporary result storage */
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  q31_t in32;                                    /* Temporary variable to store input value */
+  q15_t in16;                                    /* Temporary variable to store input value */
+  uint32_t blkCnt;                               /* loop counter */
+
+
+  /* loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* Compute Power and then store the result in a temporary variable, sum. */
+    in32 = *__SIMD32(pSrc)++;
+    sum = __SMLALD(in32, in32, sum);
+    in32 = *__SIMD32(pSrc)++;
+    sum = __SMLALD(in32, in32, sum);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* Compute Power and then store the result in a temporary variable, sum. */
+    in16 = *pSrc++;
+    sum = __SMLALD(in16, in16, sum);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  q15_t in;                                      /* Temporary variable to store input value */
+  uint32_t blkCnt;                               /* loop counter */
+
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* Compute Power and then store the result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += ((q31_t) in * in);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  /* Store the results in 34.30 format  */
+  *pResult = sum;
+}
+
+/**
+ * @} end of power group
+ */
@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_power_q31.c
+ * Description:  Sum of the squares of the elements of a Q31 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup power
+ * @{
+ */
+
+/**
+ * @brief Sum of the squares of the elements of a Q31 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult sum of the squares value returned here
+ * @return none.
+ *
+ * @details
+ * <b>Scaling and Overflow Behavior:</b>
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * The input is represented in 1.31 format.
+ * Intermediate multiplication yields a 2.62 format, and this
+ * result is truncated to 2.48 format by discarding the lower 14 bits.
+ * The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
+ * With 15 guard bits in the accumulator, there is no risk of overflow, and the
+ * full precision of the intermediate multiplication is preserved.
+ * Finally, the return result is in 16.48 format.
+ *
+ */
+
+void arm_power_q31(
+  q31_t * pSrc,
+  uint32_t blockSize,
+  q63_t * pResult)
+{
+  q63_t sum = 0;                                 /* Temporary result storage */
+  q31_t in;
+  uint32_t blkCnt;                               /* loop counter */
+
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  /*loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* Compute Power then shift intermediate results by 14 bits to maintain 16.48 format and then store the result in a temporary variable sum, providing 15 guard bits. */
+    in = *pSrc++;
+    sum += ((q63_t) in * in) >> 14U;
+
+    in = *pSrc++;
+    sum += ((q63_t) in * in) >> 14U;
+
+    in = *pSrc++;
+    sum += ((q63_t) in * in) >> 14U;
+
+    in = *pSrc++;
+    sum += ((q63_t) in * in) >> 14U;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* Compute Power and then store the result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += ((q63_t) in * in) >> 14U;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Store the results in 16.48 format  */
+  *pResult = sum;
+}
+
+/**
+ * @} end of power group
+ */
@@ -0,0 +1,127 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_power_q7.c
+ * Description:  Sum of the squares of the elements of a Q7 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup power
+ * @{
+ */
+
+/**
+ * @brief Sum of the squares of the elements of a Q7 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult sum of the squares value returned here
+ * @return none.
+ *
+ * @details
+ * <b>Scaling and Overflow Behavior:</b>
+ *
+ * \par
+ * The function is implemented using a 32-bit internal accumulator.
+ * The input is represented in 1.7 format.
+ * Intermediate multiplication yields a 2.14 format, and this
+ * result is added without saturation to an accumulator in 18.14 format.
+ * With 17 guard bits in the accumulator, there is no risk of overflow, and the
+ * full precision of the intermediate multiplication is preserved.
+ * Finally, the return result is in 18.14 format.
+ *
+ */
+
+void arm_power_q7(
+  q7_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult)
+{
+  q31_t sum = 0;                                 /* Temporary result storage */
+  q7_t in;                                       /* Temporary variable to store input */
+  uint32_t blkCnt;                               /* loop counter */
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  q31_t input1;                                  /* Temporary variable to store packed input */
+  q31_t in1, in2;                                /* Temporary variables to store input */
+
+  /*loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* Reading two inputs of pSrc vector and packing */
+    input1 = *__SIMD32(pSrc)++;
+
+    in1 = __SXTB16(__ROR(input1, 8));
+    in2 = __SXTB16(input1);
+
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* calculate power and accumulate to accumulator */
+    sum = __SMLAD(in1, in1, sum);
+    sum = __SMLAD(in2, in2, sum);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* Compute Power and then store the result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += ((q15_t) in * in);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Store the result in 18.14 format  */
+  *pResult = sum;
+}
+
+/**
+ * @} end of power group
+ */
@@ -0,0 +1,127 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rms_f32.c
+ * Description:  Root mean square value of an array of F32 type
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @defgroup RMS Root mean square (RMS)
+ *
+ *
+ * Calculates the Root Mean Sqaure of the elements in the input vector.
+ * The underlying algorithm is used:
+ *
+ * <pre>
+ * 	Result = sqrt(((pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]) / blockSize));
+ * </pre>
+ *
+ * There are separate functions for floating point, Q31, and Q15 data types.
+ */
+
+/**
+ * @addtogroup RMS
+ * @{
+ */
+
+
+/**
+ * @brief Root Mean Square of the elements of a floating-point vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult rms value returned here
+ * @return none.
+ *
+ */
+
+void arm_rms_f32(
+  float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+  float32_t sum = 0.0f;                          /* Accumulator */
+  float32_t in;                                  /* Tempoprary variable to store input value */
+  uint32_t blkCnt;                               /* loop counter */
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  /* loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* Compute sum of the squares and then store the result in a temporary variable, sum  */
+    in = *pSrc++;
+    sum += in * in;
+    in = *pSrc++;
+    sum += in * in;
+    in = *pSrc++;
+    sum += in * in;
+    in = *pSrc++;
+    sum += in * in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* Compute sum of the squares and then store the results in a temporary variable, sum  */
+    in = *pSrc++;
+    sum += in * in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Compute Rms and store the result in the destination */
+  arm_sqrt_f32(sum / (float32_t) blockSize, pResult);
+}
+
+/**
+ * @} end of RMS group
+ */
@@ -0,0 +1,139 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rms_q15.c
+ * Description:  Root Mean Square of the elements of a Q15 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @addtogroup RMS
+ * @{
+ */
+
+/**
+ * @brief Root Mean Square of the elements of a Q15 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult rms value returned here
+ * @return none.
+ *
+ * @details
+ * <b>Scaling and Overflow Behavior:</b>
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * The input is represented in 1.15 format.
+ * Intermediate multiplication yields a 2.30 format, and this
+ * result is added without saturation to a 64-bit accumulator in 34.30 format.
+ * With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ * full precision of the intermediate multiplication is preserved.
+ * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ * 15 bits, and then saturated to yield a result in 1.15 format.
+ *
+ */
+
+void arm_rms_q15(
+  q15_t * pSrc,
+  uint32_t blockSize,
+  q15_t * pResult)
+{
+  q63_t sum = 0;                                 /* accumulator */
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  q31_t in;                                      /* temporary variable to store the input value */
+  q15_t in1;                                     /* temporary variable to store the input value */
+  uint32_t blkCnt;                               /* loop counter */
+
+  /* loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+    /* Compute sum of the squares and then store the results in a temporary variable, sum */
+    in = *__SIMD32(pSrc)++;
+    sum = __SMLALD(in, in, sum);
+    in = *__SIMD32(pSrc)++;
+    sum = __SMLALD(in, in, sum);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+    /* Compute sum of the squares and then store the results in a temporary variable, sum */
+    in1 = *pSrc++;
+    sum = __SMLALD(in1, in1, sum);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Truncating and saturating the accumulator to 1.15 format */
+  /* Store the result in the destination */
+  arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult);
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  q15_t in;                                      /* temporary variable to store the input value */
+  uint32_t blkCnt;                               /* loop counter */
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+    /* Compute sum of the squares and then store the results in a temporary variable, sum */
+    in = *pSrc++;
+    sum += ((q31_t) in * in);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Truncating and saturating the accumulator to 1.15 format */
+  /* Store the result in the destination */
+  arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+
+/**
+ * @} end of RMS group
+ */
@@ -0,0 +1,137 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_rms_q31.c
+ * Description:  Root Mean Square of the elements of a Q31 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @addtogroup RMS
+ * @{
+ */
+
+
+/**
+ * @brief Root Mean Square of the elements of a Q31 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult rms value returned here
+ * @return none.
+ *
+ * @details
+ * <b>Scaling and Overflow Behavior:</b>
+ *
+ *\par
+ * The function is implemented using an internal 64-bit accumulator.
+ * The input is represented in 1.31 format, and intermediate multiplication
+ * yields a 2.62 format.
+ * The accumulator maintains full precision of the intermediate multiplication results,
+ * but provides only a single guard bit.
+ * There is no saturation on intermediate additions.
+ * If the accumulator overflows, it wraps around and distorts the result.
+ * In order to avoid overflows completely, the input signal must be scaled down by
+ * log2(blockSize) bits, as a total of blockSize additions are performed internally.
+ * Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value.
+ *
+ */
+
+void arm_rms_q31(
+  q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult)
+{
+  q63_t sum = 0;                                 /* accumulator */
+  q31_t in;                                      /* Temporary variable to store the input */
+  uint32_t blkCnt;                               /* loop counter */
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  q31_t in1, in2, in3, in4;                      /* Temporary input variables */
+
+  /*loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 8 outputs at a time.
+   ** a second loop below computes the remaining 1 to 7 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* Compute sum of the squares and then store the result in a temporary variable, sum */
+    /* read two samples from source buffer */
+    in1 = pSrc[0];
+    in2 = pSrc[1];
+
+    /* calculate power and accumulate to accumulator */
+    sum += (q63_t) in1 *in1;
+    sum += (q63_t) in2 *in2;
+
+    /* read two samples from source buffer */
+    in3 = pSrc[2];
+    in4 = pSrc[3];
+
+    /* calculate power and accumulate to accumulator */
+    sum += (q63_t) in3 *in3;
+    sum += (q63_t) in4 *in4;
+
+
+    /* update source buffer to process next samples */
+    pSrc += 4U;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* Compute sum of the squares and then store the results in a temporary variable, sum */
+    in = *pSrc++;
+    sum += (q63_t) in *in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Convert data in 2.62 to 1.31 by 31 right shifts and saturate */
+  /* Compute Rms and store the result in the destination vector */
+  arm_sqrt_q31(clip_q63_to_q31((sum / (q63_t) blockSize) >> 31), pResult);
+}
+
+/**
+ * @} end of RMS group
+ */
@@ -0,0 +1,186 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_std_f32.c
+ * Description:  Standard deviation of the elements of a floating-point vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @defgroup STD Standard deviation
+ *
+ * Calculates the standard deviation of the elements in the input vector.
+ * The underlying algorithm is used:
+ *
+ * <pre>
+ *   Result = sqrt((sumOfSquares - sum<sup>2</sup> / blockSize) / (blockSize - 1))
+ *
+ *     where, sumOfSquares = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]
+ *
+ *                     sum = pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]
+ * </pre>
+ *
+ * There are separate functions for floating point, Q31, and Q15 data types.
+ */
+
+/**
+ * @addtogroup STD
+ * @{
+ */
+
+
+/**
+ * @brief Standard deviation of the elements of a floating-point vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult standard deviation value returned here
+ * @return none.
+ */
+
+void arm_std_f32(
+  float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+  float32_t sum = 0.0f;                          /* Temporary result storage */
+  float32_t sumOfSquares = 0.0f;                 /* Sum of squares */
+  float32_t in;                                  /* input value */
+  uint32_t blkCnt;                               /* loop counter */
+#if defined (ARM_MATH_DSP)
+  float32_t meanOfSquares, mean, squareOfMean;   /* Temporary variables */
+#else
+  float32_t squareOfSum;                         /* Square of Sum */
+  float32_t var;                                 /* Temporary varaince storage */
+#endif
+
+  if (blockSize == 1U)
+  {
+    *pResult = 0;
+    return;
+  }
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  /*loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1])  */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += in;
+    sumOfSquares += in * in;
+    in = *pSrc++;
+    sum += in;
+    sumOfSquares += in * in;
+    in = *pSrc++;
+    sum += in;
+    sumOfSquares += in * in;
+    in = *pSrc++;
+    sum += in;
+    sumOfSquares += in * in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += in;
+    sumOfSquares += in * in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Compute Mean of squares of the input samples
+   * and then store the result in a temporary variable, meanOfSquares. */
+  meanOfSquares = sumOfSquares / ((float32_t) blockSize - 1.0f);
+
+  /* Compute mean of all input values */
+  mean = sum / (float32_t) blockSize;
+
+  /* Compute square of mean */
+  squareOfMean = (mean * mean) * (((float32_t) blockSize) /
+                                  ((float32_t) blockSize - 1.0f));
+
+  /* Compute standard deviation and then store the result to the destination */
+  arm_sqrt_f32((meanOfSquares - squareOfMean), pResult);
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sumOfSquares. */
+    in = *pSrc++;
+    sumOfSquares += in * in;
+
+    /* C = (A[0] + A[1] + ... + A[blockSize-1]) */
+    /* Compute Sum of the input samples
+     * and then store the result in a temporary variable, sum. */
+    sum += in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Compute the square of sum */
+  squareOfSum = ((sum * sum) / (float32_t) blockSize);
+
+  /* Compute the variance */
+  var = ((sumOfSquares - squareOfSum) / (float32_t) (blockSize - 1.0f));
+
+  /* Compute standard deviation and then store the result to the destination */
+  arm_sqrt_f32(var, pResult);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+}
+
+/**
+ * @} end of STD group
+ */
@@ -0,0 +1,174 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_std_q15.c
+ * Description:  Standard deviation of an array of Q15 vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup STD
+ * @{
+ */
+
+/**
+ * @brief Standard deviation of the elements of a Q15 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult standard deviation value returned here
+ * @return none.
+ * @details
+ * <b>Scaling and Overflow Behavior:</b>
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * The input is represented in 1.15 format.
+ * Intermediate multiplication yields a 2.30 format, and this
+ * result is added without saturation to a 64-bit accumulator in 34.30 format.
+ * With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ * full precision of the intermediate multiplication is preserved.
+ * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ * 15 bits, and then saturated to yield a result in 1.15 format.
+ */
+
+void arm_std_q15(
+  q15_t * pSrc,
+  uint32_t blockSize,
+  q15_t * pResult)
+{
+  q31_t sum = 0;                                 /* Accumulator */
+  q31_t meanOfSquares, squareOfMean;             /* square of mean and mean of square */
+  uint32_t blkCnt;                               /* loop counter */
+  q63_t sumOfSquares = 0;                        /* Accumulator */
+#if defined (ARM_MATH_DSP)
+  q31_t in;                                      /* input value */
+  q15_t in1;                                     /* input value */
+#else
+  q15_t in;                                      /* input value */
+#endif
+
+  if (blockSize == 1U)
+  {
+    *pResult = 0;
+    return;
+  }
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  /*loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1])  */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sum. */
+    in = *__SIMD32(pSrc)++;
+    sum += ((in << 16U) >> 16U);
+    sum +=  (in >> 16U);
+    sumOfSquares = __SMLALD(in, in, sumOfSquares);
+    in = *__SIMD32(pSrc)++;
+    sum += ((in << 16U) >> 16U);
+    sum +=  (in >> 16U);
+    sumOfSquares = __SMLALD(in, in, sumOfSquares);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sum. */
+    in1 = *pSrc++;
+    sumOfSquares = __SMLALD(in1, in1, sumOfSquares);
+    sum += in1;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Compute Mean of squares of the input samples
+   * and then store the result in a temporary variable, meanOfSquares. */
+  meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U));
+
+  /* Compute square of mean */
+  squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
+
+  /* mean of the squares minus the square of the mean. */
+  /* Compute standard deviation and store the result to the destination */
+  arm_sqrt_q15(__SSAT((meanOfSquares - squareOfMean) >> 15U, 16U), pResult);
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sumOfSquares. */
+    in = *pSrc++;
+    sumOfSquares += (in * in);
+
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    /* Compute sum of all input values and then store the result in a temporary variable, sum. */
+    sum += in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Compute Mean of squares of the input samples
+   * and then store the result in a temporary variable, meanOfSquares. */
+  meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U));
+
+  /* Compute square of mean */
+  squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
+
+  /* mean of the squares minus the square of the mean. */
+  /* Compute standard deviation and store the result to the destination */
+  arm_sqrt_q15(__SSAT((meanOfSquares - squareOfMean) >> 15U, 16U), pResult);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+}
+
+/**
+ * @} end of STD group
+ */
@@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_std_q31.c
+ * Description:  Standard deviation of an array of Q31 type.
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup STD
+ * @{
+ */
+
+/**
+ * @brief Standard deviation of the elements of a Q31 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult standard deviation value returned here
+ * @return none.
+ * @details
+ * <b>Scaling and Overflow Behavior:</b>
+ *
+ *\par
+ * The function is implemented using an internal 64-bit accumulator.
+ * The input is represented in 1.31 format, which is then downshifted by 8 bits
+ * which yields 1.23, and intermediate multiplication yields a 2.46 format.
+ * The accumulator maintains full precision of the intermediate multiplication results,
+ * but provides only a 16 guard bits.
+ * There is no saturation on intermediate additions.
+ * If the accumulator overflows it wraps around and distorts the result.
+ * In order to avoid overflows completely the input signal must be scaled down by
+ * log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
+ * After division, internal variables should be Q18.46
+ * Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
+ *
+ */
+
+void arm_std_q31(
+  q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult)
+{
+  q63_t sum = 0;                                 /* Accumulator */
+  q63_t meanOfSquares, squareOfMean;             /* square of mean and mean of square */
+  q31_t in;                                      /* input value */
+  uint32_t blkCnt;                               /* loop counter */
+  q63_t sumOfSquares = 0;                        /* Accumulator */
+
+  if (blockSize == 1U)
+  {
+    *pResult = 0;
+    return;
+  }
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  /*loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1])  */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sum. */
+    in = *pSrc++ >> 8U;
+    sum += in;
+    sumOfSquares += ((q63_t) (in) * (in));
+    in = *pSrc++ >> 8U;
+    sum += in;
+    sumOfSquares += ((q63_t) (in) * (in));
+    in = *pSrc++ >> 8U;
+    sum += in;
+    sumOfSquares += ((q63_t) (in) * (in));
+    in = *pSrc++ >> 8U;
+    sum += in;
+    sumOfSquares += ((q63_t) (in) * (in));
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sum. */
+    in = *pSrc++ >> 8U;
+    sum += in;
+    sumOfSquares += ((q63_t) (in) * (in));
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Compute Mean of squares of the input samples
+   * and then store the result in a temporary variable, meanOfSquares. */
+  meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U);
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sumOfSquares. */
+    in = *pSrc++ >> 8U;
+    sumOfSquares += ((q63_t) (in) * (in));
+
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    /* Compute sum of all input values and then store the result in a temporary variable, sum. */
+    sum += in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Compute Mean of squares of the input samples
+   * and then store the result in a temporary variable, meanOfSquares. */
+  meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  /* Compute square of mean */
+  squareOfMean = sum * sum / (q63_t)(blockSize * (blockSize - 1U));
+
+  /* Compute standard deviation and then store the result to the destination */
+  arm_sqrt_q31((meanOfSquares - squareOfMean) >> 15U, pResult);
+}
+
+/**
+ * @} end of STD group
+ */
@@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_var_f32.c
+ * Description:  Variance of the elements of a floating-point vector
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @defgroup variance  Variance
+ *
+ * Calculates the variance of the elements in the input vector.
+ * The underlying algorithm used is the direct method sometimes referred to as the two-pass method:
+ *
+ * <pre>
+ *   Result = sum(element - meanOfElements)^2) / numElement - 1
+ *
+ *     where, meanOfElements = ( pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] ) / blockSize
+ *
+ * </pre>
+ *
+ * There are separate functions for floating point, Q31, and Q15 data types.
+ */
+
+/**
+ * @addtogroup variance
+ * @{
+ */
+
+
+/**
+ * @brief Variance of the elements of a floating-point vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult variance value returned here
+ * @return none.
+ */
+
+void arm_var_f32(
+                 float32_t * pSrc,
+                 uint32_t blockSize,
+                 float32_t * pResult)
+{
+    float32_t fMean, fValue;
+    uint32_t blkCnt;            /* loop counter */
+    float32_t * pInput = pSrc;
+    float32_t sum = 0.0f;
+    float32_t fSum = 0.0f;
+    #if defined(ARM_MATH_DSP)
+    float32_t in1, in2, in3, in4;
+    #endif
+
+    if (blockSize <= 1U)
+    {
+        *pResult = 0;
+        return;
+    }
+
+    #if defined(ARM_MATH_DSP)
+        /* Run the below code for Cortex-M4 and Cortex-M7 */
+
+        /*loop Unrolling */
+        blkCnt = blockSize >> 2U;
+
+        /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+        ** a second loop below computes the remaining 1 to 3 samples. */
+        while (blkCnt > 0U)
+        {
+            /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+            in1 = *pInput++;
+            in2 = *pInput++;
+            in3 = *pInput++;
+            in4 = *pInput++;
+
+            sum += in1;
+            sum += in2;
+            sum += in3;
+            sum += in4;
+
+            /* Decrement the loop counter */
+            blkCnt--;
+        }
+
+        /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+        ** No loop unrolling is used. */
+        blkCnt = blockSize % 0x4U;
+
+    #else
+        /* Run the below code for Cortex-M0 or Cortex-M3 */
+
+        /* Loop over blockSize number of values */
+        blkCnt = blockSize;
+
+    #endif
+
+    while (blkCnt > 0U)
+    {
+        /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+        sum += *pInput++;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize  */
+    fMean = sum / (float32_t) blockSize;
+
+    pInput = pSrc;
+
+    #if defined(ARM_MATH_DSP)
+
+        /*loop Unrolling */
+        blkCnt = blockSize >> 2U;
+
+        /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+        ** a second loop below computes the remaining 1 to 3 samples. */
+        while (blkCnt > 0U)
+        {
+            fValue = *pInput++ - fMean;
+            fSum += fValue * fValue;
+            fValue = *pInput++ - fMean;
+            fSum += fValue * fValue;
+            fValue = *pInput++ - fMean;
+            fSum += fValue * fValue;
+            fValue = *pInput++ - fMean;
+            fSum += fValue * fValue;
+
+            /* Decrement the loop counter */
+            blkCnt--;
+        }
+
+        blkCnt = blockSize % 0x4U;
+    #else
+        /* Run the below code for Cortex-M0 or Cortex-M3 */
+
+        /* Loop over blockSize number of values */
+        blkCnt = blockSize;
+    #endif
+
+    while (blkCnt > 0U)
+    {
+        fValue = *pInput++ - fMean;
+        fSum += fValue * fValue;
+
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+    /* Variance */
+    *pResult = fSum / (float32_t)(blockSize - 1.0f);
+}
+
+/**
+ * @} end of variance group
+ */
@@ -0,0 +1,172 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_var_q15.c
+ * Description:  Variance of an array of Q15 type
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup variance
+ * @{
+ */
+
+/**
+ * @brief Variance of the elements of a Q15 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult variance value returned here
+ * @return none.
+ * @details
+ * <b>Scaling and Overflow Behavior:</b>
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * The input is represented in 1.15 format.
+ * Intermediate multiplication yields a 2.30 format, and this
+ * result is added without saturation to a 64-bit accumulator in 34.30 format.
+ * With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ * full precision of the intermediate multiplication is preserved.
+ * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ * 15 bits, and then saturated to yield a result in 1.15 format.
+ */
+
+void arm_var_q15(
+  q15_t * pSrc,
+  uint32_t blockSize,
+  q15_t * pResult)
+{
+  q31_t sum = 0;                                 /* Accumulator */
+  q31_t meanOfSquares, squareOfMean;             /* square of mean and mean of square */
+  uint32_t blkCnt;                               /* loop counter */
+  q63_t sumOfSquares = 0;                        /* Accumulator */
+#if defined (ARM_MATH_DSP)
+  q31_t in;                                      /* input value */
+  q15_t in1;                                     /* input value */
+#else
+  q15_t in;                                      /* input value */
+#endif
+
+  if (blockSize == 1U)
+  {
+    *pResult = 0;
+    return;
+  }
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  /*loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1])  */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sum. */
+    in = *__SIMD32(pSrc)++;
+    sum += ((in << 16U) >> 16U);
+    sum +=  (in >> 16U);
+    sumOfSquares = __SMLALD(in, in, sumOfSquares);
+    in = *__SIMD32(pSrc)++;
+    sum += ((in << 16U) >> 16U);
+    sum +=  (in >> 16U);
+    sumOfSquares = __SMLALD(in, in, sumOfSquares);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sum. */
+    in1 = *pSrc++;
+    sumOfSquares = __SMLALD(in1, in1, sumOfSquares);
+    sum += in1;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Compute Mean of squares of the input samples
+   * and then store the result in a temporary variable, meanOfSquares. */
+  meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U));
+
+  /* Compute square of mean */
+  squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
+
+  /* mean of the squares minus the square of the mean. */
+  *pResult = (meanOfSquares - squareOfMean) >> 15U;
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sumOfSquares. */
+    in = *pSrc++;
+    sumOfSquares += (in * in);
+
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    /* Compute sum of all input values and then store the result in a temporary variable, sum. */
+    sum += in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Compute Mean of squares of the input samples
+   * and then store the result in a temporary variable, meanOfSquares. */
+  meanOfSquares = (q31_t)(sumOfSquares / (q63_t)(blockSize - 1U));
+
+  /* Compute square of mean */
+  squareOfMean = (q31_t)((q63_t)sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
+
+  /* mean of the squares minus the square of the mean. */
+  *pResult = (meanOfSquares - squareOfMean) >> 15;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+}
+
+/**
+ * @} end of variance group
+ */
@@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+ * Project:      CMSIS DSP Library
+ * Title:        arm_var_q31.c
+ * Description:  Variance of an array of Q31 type
+ *
+ * $Date:        27. January 2017
+ * $Revision:    V.1.5.1
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2017 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup variance
+ * @{
+ */
+
+/**
+ * @brief Variance of the elements of a Q31 vector.
+ * @param[in]       *pSrc points to the input vector
+ * @param[in]       blockSize length of the input vector
+ * @param[out]      *pResult variance value returned here
+ * @return none.
+ * @details
+ * <b>Scaling and Overflow Behavior:</b>
+ *
+ *\par
+ * The function is implemented using an internal 64-bit accumulator.
+ * The input is represented in 1.31 format, which is then downshifted by 8 bits
+ * which yields 1.23, and intermediate multiplication yields a 2.46 format.
+ * The accumulator maintains full precision of the intermediate multiplication results,
+ * but provides only a 16 guard bits.
+ * There is no saturation on intermediate additions.
+ * If the accumulator overflows it wraps around and distorts the result.
+ * In order to avoid overflows completely the input signal must be scaled down by
+ * log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
+ * After division, internal variables should be Q18.46
+ * Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
+ *
+ */
+
+void arm_var_q31(
+  q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult)
+{
+  q63_t sum = 0;                                 /* Accumulator */
+  q63_t meanOfSquares, squareOfMean;             /* square of mean and mean of square */
+  q31_t in;                                      /* input value */
+  uint32_t blkCnt;                               /* loop counter */
+  q63_t sumOfSquares = 0;                        /* Accumulator */
+
+  if (blockSize == 1U)
+  {
+    *pResult = 0;
+    return;
+  }
+
+#if defined (ARM_MATH_DSP)
+  /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+  /*loop Unrolling */
+  blkCnt = blockSize >> 2U;
+
+  /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1])  */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sum. */
+    in = *pSrc++ >> 8U;
+    sum += in;
+    sumOfSquares += ((q63_t) (in) * (in));
+    in = *pSrc++ >> 8U;
+    sum += in;
+    sumOfSquares += ((q63_t) (in) * (in));
+    in = *pSrc++ >> 8U;
+    sum += in;
+    sumOfSquares += ((q63_t) (in) * (in));
+    in = *pSrc++ >> 8U;
+    sum += in;
+    sumOfSquares += ((q63_t) (in) * (in));
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sum. */
+    in = *pSrc++ >> 8U;
+    sum += in;
+    sumOfSquares += ((q63_t) (in) * (in));
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Compute Mean of squares of the input samples
+   * and then store the result in a temporary variable, meanOfSquares. */
+  meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U);
+
+#else
+  /* Run the below code for Cortex-M0 */
+
+  /* Loop over blockSize number of values */
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+    /* Compute Sum of squares of the input samples
+     * and then store the result in a temporary variable, sumOfSquares. */
+    in = *pSrc++ >> 8U;
+    sumOfSquares += ((q63_t) (in) * (in));
+
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    /* Compute sum of all input values and then store the result in a temporary variable, sum. */
+    sum += in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Compute Mean of squares of the input samples
+   * and then store the result in a temporary variable, meanOfSquares. */
+  meanOfSquares = sumOfSquares / (q63_t)(blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+  /* Compute square of mean */
+  squareOfMean = sum * sum / (q63_t)(blockSize * (blockSize - 1U));
+
+  /* Compute standard deviation and then store the result to the destination */
+  *pResult = (meanOfSquares - squareOfMean) >> 15U;
+}
+
+/**
+ * @} end of variance group
+ */