lib/crc/riscv/crc-clmul-template.h - linux - Git at Google

 /* SPDX-License-Identifier: GPL-2.0-or-later */
 /* Copyright 2025 Google LLC */

 /*
  * This file is a "template" that generates a CRC function optimized using the
  * RISC-V Zbc (scalar carryless multiplication) extension.  The includer of this
  * file must define the following parameters to specify the type of CRC:
  *
  *	crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
  *	LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
  *		 mapping between bits and polynomial coefficients
  *	         1 for a lsb (least-significant-bit) first CRC, i.e. reflected
  *	         mapping between bits and polynomial coefficients
  */

 #include <asm/byteorder.h>
 #include <linux/minmax.h>

 #define CRC_BITS	(8 * sizeof(crc_t))	/* a.k.a. 'n' */

 static inline unsigned long clmul(unsigned long a, unsigned long b)
 {
 	unsigned long res;

 	asm(".option push\n"
 	    ".option arch,+zbc\n"
 	    "clmul %0, %1, %2\n"
 	    ".option pop\n"
 	    : "=r" (res) : "r" (a), "r" (b));
 	return res;
 }

 static inline unsigned long clmulh(unsigned long a, unsigned long b)
 {
 	unsigned long res;

 	asm(".option push\n"
 	    ".option arch,+zbc\n"
 	    "clmulh %0, %1, %2\n"
 	    ".option pop\n"
 	    : "=r" (res) : "r" (a), "r" (b));
 	return res;
 }

 static inline unsigned long clmulr(unsigned long a, unsigned long b)
 {
 	unsigned long res;

 	asm(".option push\n"
 	    ".option arch,+zbc\n"
 	    "clmulr %0, %1, %2\n"
 	    ".option pop\n"
 	    : "=r" (res) : "r" (a), "r" (b));
 	return res;
 }

 /*
  * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
  * polynomial whose bit order matches the CRC's bit order.
  */
 #ifdef CONFIG_64BIT
 #  if LSB_CRC
 #    define crc_load_long(x)	le64_to_cpup(x)
 #  else
 #    define crc_load_long(x)	be64_to_cpup(x)
 #  endif
 #else
 #  if LSB_CRC
 #    define crc_load_long(x)	le32_to_cpup(x)
 #  else
 #    define crc_load_long(x)	be32_to_cpup(x)
 #  endif
 #endif

 /* XOR @crc into the end of @msgpoly that represents the high-order terms. */
 static inline unsigned long
 crc_clmul_prep(crc_t crc, unsigned long msgpoly)
 {
 #if LSB_CRC
 	return msgpoly ^ crc;
 #else
 	return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
 #endif
 }

 /*
  * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
  * modulo the generator polynomial G.  This gives the CRC of @msgpoly.
  */
 static inline crc_t
 crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
 {
 	unsigned long tmp;

 	/*
 	 * First step of Barrett reduction with integrated multiplication by
 	 * x^n: calculate floor((msgpoly * x^n) / G).  This is the value by
 	 * which G needs to be multiplied to cancel out the x^n and higher terms
 	 * of msgpoly * x^n.  Do it using the following formula:
 	 *
 	 * msb-first:
 	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
 	 * lsb-first:
 	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
 	 *
 	 * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
 	 * which fits a long exactly.  Using any lower power of x there would
 	 * not carry enough precision through the calculation, while using any
 	 * higher power of x would require extra instructions to handle a wider
 	 * multiplication.  In the msb-first case, using this power of x results
 	 * in needing a floored division by x^(BITS_PER_LONG-1), which matches
 	 * what clmulr produces.  In the lsb-first case, a factor of x gets
 	 * implicitly introduced by each carryless multiplication (shown as
 	 * '* x' above), and the floored division instead needs to be by
 	 * x^BITS_PER_LONG which matches what clmul produces.
 	 */
 #if LSB_CRC
 	tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
 #else
 	tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
 #endif

 	/*
 	 * Second step of Barrett reduction:
 	 *
 	 *    crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
 	 *
 	 * This reduces (msgpoly * x^n) modulo G by adding the appropriate
 	 * multiple of G to it.  The result uses only the x^0..x^(n-1) terms.
 	 * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
 	 * terms in the first place, it is more efficient to do the equivalent:
 	 *
 	 *    crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
 	 *
 	 * In the lsb-first case further modify it to the following which avoids
 	 * a shift, as the crc ends up in the physically low n bits from clmulr:
 	 *
 	 *    product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
 	 *    crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
 	 *
 	 * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
 	 * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above.  The
 	 * cast of the result to crc_t is essential, as it applies the mod x^n!
 	 */
 #if LSB_CRC
 	return clmulr(tmp, consts->barrett_reduction_const_2);
 #else
 	return clmul(tmp, consts->barrett_reduction_const_2);
 #endif
 }

 /* Update @crc with the data from @msgpoly. */
 static inline crc_t
 crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
 		      const struct crc_clmul_consts *consts)
 {
 	return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
 }

 /* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
 static inline crc_t
 crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
 			 const struct crc_clmul_consts *consts)
 {
 	unsigned long msgpoly;
 	size_t i;

 #if LSB_CRC
 	msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
 	for (i = 1; i < len; i++)
 		msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
 #else
 	msgpoly = p[0];
 	for (i = 1; i < len; i++)
 		msgpoly = (msgpoly << 8) ^ p[i];
 #endif

 	if (len >= sizeof(crc_t)) {
 	#if LSB_CRC
 		msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
 	#else
 		msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
 	#endif
 		return crc_clmul_long(msgpoly, consts);
 	}
 #if LSB_CRC
 	msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
 	return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
 #else
 	msgpoly ^= crc >> (CRC_BITS - 8*len);
 	return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
 #endif
 }

 static inline crc_t
 crc_clmul(crc_t crc, const void *p, size_t len,
 	  const struct crc_clmul_consts *consts)
 {
 	size_t align;

 	/* This implementation assumes that the CRC fits in an unsigned long. */
 	BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));

 	/* If the buffer is not long-aligned, align it. */
 	align = (unsigned long)p % sizeof(unsigned long);
 	if (align && len) {
 		align = min(sizeof(unsigned long) - align, len);
 		crc = crc_clmul_update_partial(crc, p, align, consts);
 		p += align;
 		len -= align;
 	}

 	if (len >= 4 * sizeof(unsigned long)) {
 		unsigned long m0, m1;

 		m0 = crc_clmul_prep(crc, crc_load_long(p));
 		m1 = crc_load_long(p + sizeof(unsigned long));
 		p += 2 * sizeof(unsigned long);
 		len -= 2 * sizeof(unsigned long);
 		/*
 		 * Main loop.  Each iteration starts with a message polynomial
 		 * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
 		 * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
 		 * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
 		 * "folds" that back into a congruent (modulo G) value that uses
 		 * just m0 and m1 again.  This is done by multiplying m0 by the
 		 * precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by
 		 * the precomputed constant (x^(2*BITS_PER_LONG) mod G), then
 		 * adding the results to m2 and m3 as appropriate.  Each such
 		 * multiplication produces a result twice the length of a long,
 		 * which in RISC-V is two instructions clmul and clmulh.
 		 *
 		 * This could be changed to fold across more than 2 longs at a
 		 * time if there is a CPU that can take advantage of it.
 		 */
 		do {
 			unsigned long p0, p1, p2, p3;

 			p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
 			p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
 			p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
 			p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
 			m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
 			m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
 			     crc_load_long(p + sizeof(unsigned long));

 			p += 2 * sizeof(unsigned long);
 			len -= 2 * sizeof(unsigned long);
 		} while (len >= 2 * sizeof(unsigned long));

 		crc = crc_clmul_long(m0, consts);
 		crc = crc_clmul_update_long(crc, m1, consts);
 	}

 	while (len >= sizeof(unsigned long)) {
 		crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
 		p += sizeof(unsigned long);
 		len -= sizeof(unsigned long);
 	}

 	if (len)
 		crc = crc_clmul_update_partial(crc, p, len, consts);

 	return crc;
 }
	/* SPDX-License-Identifier: GPL-2.0-or-later */
	/* Copyright 2025 Google LLC */

	/*
	* This file is a "template" that generates a CRC function optimized using the
	* RISC-V Zbc (scalar carryless multiplication) extension. The includer of this
	* file must define the following parameters to specify the type of CRC:
	*
	* crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
	* LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
	* mapping between bits and polynomial coefficients
	* 1 for a lsb (least-significant-bit) first CRC, i.e. reflected
	* mapping between bits and polynomial coefficients
	*/

	#include <asm/byteorder.h>
	#include <linux/minmax.h>

	#define CRC_BITS (8 * sizeof(crc_t)) /* a.k.a. 'n' */

	static inline unsigned long clmul(unsigned long a, unsigned long b)
	{
	unsigned long res;

	asm(".option push\n"
	".option arch,+zbc\n"
	"clmul %0, %1, %2\n"
	".option pop\n"
	: "=r" (res) : "r" (a), "r" (b));
	return res;
	}

	static inline unsigned long clmulh(unsigned long a, unsigned long b)
	{
	unsigned long res;

	asm(".option push\n"
	".option arch,+zbc\n"
	"clmulh %0, %1, %2\n"
	".option pop\n"
	: "=r" (res) : "r" (a), "r" (b));
	return res;
	}

	static inline unsigned long clmulr(unsigned long a, unsigned long b)
	{
	unsigned long res;

	asm(".option push\n"
	".option arch,+zbc\n"
	"clmulr %0, %1, %2\n"
	".option pop\n"
	: "=r" (res) : "r" (a), "r" (b));
	return res;
	}

	/*
	* crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
	* polynomial whose bit order matches the CRC's bit order.
	*/
	#ifdef CONFIG_64BIT
	# if LSB_CRC
	# define crc_load_long(x) le64_to_cpup(x)
	# else
	# define crc_load_long(x) be64_to_cpup(x)
	# endif
	#else
	# if LSB_CRC
	# define crc_load_long(x) le32_to_cpup(x)
	# else
	# define crc_load_long(x) be32_to_cpup(x)
	# endif
	#endif

	/* XOR @crc into the end of @msgpoly that represents the high-order terms. */
	static inline unsigned long
	crc_clmul_prep(crc_t crc, unsigned long msgpoly)
	{
	#if LSB_CRC
	return msgpoly ^ crc;
	#else
	return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
	#endif
	}

	/*
	* Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
	* modulo the generator polynomial G. This gives the CRC of @msgpoly.
	*/
	static inline crc_t
	crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
	{
	unsigned long tmp;

	/*
	* First step of Barrett reduction with integrated multiplication by
	* x^n: calculate floor((msgpoly * x^n) / G). This is the value by
	* which G needs to be multiplied to cancel out the x^n and higher terms
	* of msgpoly * x^n. Do it using the following formula:
	*
	* msb-first:
	* floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
	* lsb-first:
	* floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
	*
	* barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
	* which fits a long exactly. Using any lower power of x there would
	* not carry enough precision through the calculation, while using any
	* higher power of x would require extra instructions to handle a wider
	* multiplication. In the msb-first case, using this power of x results
	* in needing a floored division by x^(BITS_PER_LONG-1), which matches
	* what clmulr produces. In the lsb-first case, a factor of x gets
	* implicitly introduced by each carryless multiplication (shown as
	* '* x' above), and the floored division instead needs to be by
	* x^BITS_PER_LONG which matches what clmul produces.
	*/
	#if LSB_CRC
	tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
	#else
	tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
	#endif

	/*
	* Second step of Barrett reduction:
	*
	* crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
	*
	* This reduces (msgpoly * x^n) modulo G by adding the appropriate
	* multiple of G to it. The result uses only the x^0..x^(n-1) terms.
	* HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
	* terms in the first place, it is more efficient to do the equivalent:
	*
	* crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
	*
	* In the lsb-first case further modify it to the following which avoids
	* a shift, as the crc ends up in the physically low n bits from clmulr:
	*
	* product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
	* crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
	*
	* barrett_reduction_const_2 contains the constant multiplier (G - x^n)
	* or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above. The
	* cast of the result to crc_t is essential, as it applies the mod x^n!
	*/
	#if LSB_CRC
	return clmulr(tmp, consts->barrett_reduction_const_2);
	#else
	return clmul(tmp, consts->barrett_reduction_const_2);
	#endif
	}

	/* Update @crc with the data from @msgpoly. */
	static inline crc_t
	crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
	const struct crc_clmul_consts *consts)
	{
	return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
	}

	/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
	static inline crc_t
	crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
	const struct crc_clmul_consts *consts)
	{
	unsigned long msgpoly;
	size_t i;

	#if LSB_CRC
	msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
	for (i = 1; i < len; i++)
	msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
	#else
	msgpoly = p[0];
	for (i = 1; i < len; i++)
	msgpoly = (msgpoly << 8) ^ p[i];
	#endif

	if (len >= sizeof(crc_t)) {
	#if LSB_CRC
	msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
	#else
	msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
	#endif
	return crc_clmul_long(msgpoly, consts);
	}
	#if LSB_CRC
	msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
	#else
	msgpoly ^= crc >> (CRC_BITS - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
	#endif
	}

	static inline crc_t
	crc_clmul(crc_t crc, const void *p, size_t len,
	const struct crc_clmul_consts *consts)
	{
	size_t align;

	/* This implementation assumes that the CRC fits in an unsigned long. */
	BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));

	/* If the buffer is not long-aligned, align it. */
	align = (unsigned long)p % sizeof(unsigned long);
	if (align && len) {
	align = min(sizeof(unsigned long) - align, len);
	crc = crc_clmul_update_partial(crc, p, align, consts);
	p += align;
	len -= align;
	}

	if (len >= 4 * sizeof(unsigned long)) {
	unsigned long m0, m1;

	m0 = crc_clmul_prep(crc, crc_load_long(p));
	m1 = crc_load_long(p + sizeof(unsigned long));
	p += 2 * sizeof(unsigned long);
	len -= 2 * sizeof(unsigned long);
	/*
	* Main loop. Each iteration starts with a message polynomial
	* (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
	* more longs of data to form x^(3BITS_PER_LONG)m0 +
	* x^(2BITS_PER_LONG)m1 + x^BITS_PER_LONG*m2 + m3, then
	* "folds" that back into a congruent (modulo G) value that uses
	* just m0 and m1 again. This is done by multiplying m0 by the
	* precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by
	* the precomputed constant (x^(2*BITS_PER_LONG) mod G), then
	* adding the results to m2 and m3 as appropriate. Each such
	* multiplication produces a result twice the length of a long,
	* which in RISC-V is two instructions clmul and clmulh.
	*
	* This could be changed to fold across more than 2 longs at a
	* time if there is a CPU that can take advantage of it.
	*/
	do {
	unsigned long p0, p1, p2, p3;

	p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
	p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
	p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
	p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
	m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
	m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
	crc_load_long(p + sizeof(unsigned long));

	p += 2 * sizeof(unsigned long);
	len -= 2 * sizeof(unsigned long);
	} while (len >= 2 * sizeof(unsigned long));

	crc = crc_clmul_long(m0, consts);
	crc = crc_clmul_update_long(crc, m1, consts);
	}

	while (len >= sizeof(unsigned long)) {
	crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
	p += sizeof(unsigned long);
	len -= sizeof(unsigned long);
	}

	if (len)
	crc = crc_clmul_update_partial(crc, p, len, consts);

	return crc;
	}