/*=============================================================================

    This file is part of FLINT.

    FLINT is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    FLINT is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with FLINT; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA

=============================================================================*/
/******************************************************************************

    Copyright (C) 2010 William Hart
    Copyright (C) 2011 Fredrik Johansson
    Copyright (C) 2011 Sebastian Pancratz

******************************************************************************/

*******************************************************************************

    Memory management

*******************************************************************************

void nmod_poly_init(nmod_poly_t poly, mp_limb_t n)

    Initialises \code{poly}. It will have coefficients modulo~$n$.

void nmod_poly_init_preinv(nmod_poly_t poly, mp_limb_t n, mp_limb_t ninv)

    Initialises \code{poly}. It will have coefficients modulo~$n$.  
    The caller supplies a precomputed inverse limb generated by 
    \code{n_preinvert_limb()}.

void nmod_poly_init2(nmod_poly_t poly, mp_limb_t n, long alloc)

    Initialises \code{poly}. It will have coefficients modulo~$n$.
    Up to \code{alloc} coefficients may be stored in \code{poly}.

void nmod_poly_init2_preinv(nmod_poly_t poly, 
                       mp_limb_t n, mp_limb_t ninv, long alloc)

    Initialises \code{poly}. It will have coefficients modulo~$n$. 
    The caller supplies a precomputed inverse limb generated by 
    \code{n_preinvert_limb()}. Up to \code{alloc} coefficients may 
    be stored in \code{poly}.

void nmod_poly_realloc(nmod_poly_t poly, long alloc)

    Reallocates \code{poly} to the given length. If the current 
    length is greater than \code{alloc}, the polynomial is truncated 
    and normalised.  If \code{alloc} is zero, the polynomial is 
    cleared.

void nmod_poly_clear(nmod_poly_t poly)

    Clears the polynomial and releases any memory it used. The polynomial 
    cannot be used again until it is initialised.

void nmod_poly_fit_length(nmod_poly_t poly, long alloc)

    Ensures \code{poly} has space for at least \code{alloc} coefficients. 
    This function only ever grows the allocated space, so no data loss can 
    occur.

void _nmod_poly_normalise(nmod_poly_t poly)

    Internal function for normalising a polynomial so that the top 
    coefficient, if there is one at all, is not zero.

*******************************************************************************

    Polynomial properties

*******************************************************************************

long nmod_poly_length(const nmod_poly_t poly)

    Returns the length of the polynomial \code{poly}. The zero polynomial
    has length zero.

long nmod_poly_degree(const nmod_poly_t poly)

    Returns the degree of the polynomial \code{poly}. The zero polynomial
    is deemed to have degree~$-1$.

mp_limb_t nmod_poly_modulus(const nmod_poly_t poly)

    Returns the modulus of the polynomial \code{poly}. This will be a 
    positive integer.

mp_bitcnt_t nmod_poly_max_bits(const nmod_poly_t poly)

    Returns the maximum number of bits of any coefficient of \code{poly}.

*******************************************************************************

    Assignment and basic manipulation

*******************************************************************************

void nmod_poly_set(nmod_poly_t a, const nmod_poly_t b)

    Sets \code{a} to a copy of \code{b}.

void nmod_poly_swap(nmod_poly_t poly1, nmod_poly_t poly2)

    Efficiently swaps \code{poly1} and \code{poly2} by swapping pointers 
    internally. 

void nmod_poly_zero(nmod_poly_t res)

    Sets \code{res} to the zero polynomial.

void nmod_poly_truncate(nmod_poly_t poly, long len)

    Truncates \code{poly} to the given length and normalises it. 
    If \code{len} is greater than the current length of \code{poly}, 
    then nothing happens.

void _nmod_poly_reverse(mp_ptr output, mp_srcptr input, long len, long m)

    Sets \code{output} to the reverse of \code{input}, which is of length
    \code{len}, but thinking of it as a polynomial of length~\code{m}, 
    notionally zero-padded if necessary. The length~\code{m} must be 
    non-negative, but there are no other restrictions. The polynomial
    \code{output} must have space for \code{m} coefficients. 

void nmod_poly_reverse(nmod_poly_t output, const nmod_poly_t input, long m)

    Sets \code{output} to the reverse of \code{input}, thinking of it as
    a polynomial of length~\code{m}, notionally zero-padded if necessary.
    The length~\code{m} must be non-negative, but there are no other
    restrictions. The output polynomial will be set to length~\code{m}
    and then normalised.

*******************************************************************************

    Randomisation

*******************************************************************************

void nmod_poly_randtest(nmod_poly_t poly, flint_rand_t state, long len)

    Generates a random polynomial with up to the given length.

*******************************************************************************

    Getting and setting coefficients

*******************************************************************************

ulong nmod_poly_get_coeff_ui(const nmod_poly_t poly, long j)

    Returns the coefficient of \code{poly} at index~\code{j}, where 
    coefficients are numbered with zero being the constant coefficient, 
    and returns it as an \code{unsigned long}. If \code{j} refers to a 
    coefficient beyond the end of \code{poly}, zero is returned.

void nmod_poly_set_coeff_ui(nmod_poly_t poly, long j, ulong c)

    Sets the coefficient of \code{poly} at index \code{j}, where 
    coefficients are numbered with zero being the constant coefficient, 
    to the value \code{c} reduced modulo the modulus of \code{poly}. 
    If \code{j} refers to a coefficient beyond the current end of \code{poly}, 
    the polynomial is first resized, with intervening coefficients being
    set to zero.

*******************************************************************************

    Input and output

*******************************************************************************

char * nmod_poly_get_str(const nmod_poly_t poly)

    Writes \code{poly} to a string representation. The format is as 
    described for \code{nmod_poly_print()}. The string must be freed by the 
    user when finished. For this it is sufficient to call \code{flint_free()}.

int nmod_poly_set_str(nmod_poly_t poly, const char * s)

    Reads \code{poly} from a string \code{s}. The format is as described
    for \code{nmod_poly_print()}. If a polynomial in the correct format
    is read, a positive value is returned, otherwise a non-positive value 
    is returned.

int nmod_poly_print(const nmod_poly_t a)

    Prints the polynomial to \code{stdout}. The length is printed, 
    followed by a space, then the modulus. If the length is zero this is
    all that is printed, otherwise two spaces followed by a 
    space-separated list of coefficients is printed, beginning with the constant 
    coefficient.

    In case of success, returns a positive value.  In case of failure, 
    returns a non-positive value.

int nmod_poly_fread(FILE * f, nmod_poly_t poly)

    Reads \code{poly} from the file stream \code{f}. If this is a file
    that has just been written, the file should be closed then opened
    again. The format is as described for \code{nmod_poly_print()}. If a 
    polynomial in the correct format is read, a positive value is returned, 
    otherwise a non-positive value is returned.

int nmod_poly_fprint(FILE * f, const nmod_poly_t poly)

    Writes a polynomial to the file stream \code{f}. If this is a file
    then the file should be closed and reopened before being read.
    The format is as described for \code{nmod_poly_print()}. If an 
    error occurs whilst writing to the file, an error message is printed.

    In case of success, returns a positive value.  In case of failure, 
    returns a non-positive value.

int nmod_poly_read(nmod_poly_t poly)

    Reads \code{poly} from \code{stdin}. The format is as described for 
    \code{nmod_poly_print()}. If a polynomial in the correct format is read, a 
    positive value is returned, otherwise a non-positive value is returned.

*******************************************************************************

    Comparison

*******************************************************************************

int nmod_poly_equal(const nmod_poly_t a, const nmod_poly_t b)

    Returns~$1$ if the polynomials are equal, otherwise~$0$.

int nmod_poly_is_zero(const nmod_poly_t poly)

    Returns~$1$ if the polynomial \code{poly} is the zero polynomial,
    otherwise returns~$0$.

int nmod_poly_is_one(const nmod_poly_t poly)

    Returns~$1$ if the polynomial \code{poly} is the constant polynomial 1,
    otherwise returns~$0$.

*******************************************************************************

    Shifting

*******************************************************************************

void _nmod_poly_shift_left(mp_ptr res, mp_srcptr poly, long len, long k)

    Sets \code{(res, len + k)} to \code{(poly, len)} shifted left by 
    \code{k} coefficients. Assumes that \code{res} has space for 
    \code{len + k} coefficients.

void nmod_poly_shift_left(nmod_poly_t res, const nmod_poly_t poly, long k)

    Sets \code{res} to \code{poly} shifted left by \code{k} coefficients, 
    i.e.\ multiplied by $x^k$.

void _nmod_poly_shift_right(mp_ptr res, mp_srcptr poly, long len, long k)

    Sets \code{(res, len - k)} to \code{(poly, len)} shifted right by 
    \code{k} coefficients. It is assumed that \code{k <= len} and that
    \code{res} has space for at least \code{len - k} coefficients.

void nmod_poly_shift_right(nmod_poly_t res, const nmod_poly_t poly, long k)

    Sets \code{res} to \code{poly} shifted right by \code{k} coefficients, 
    i.e.\ divides by $x^k$ and throws away the remainder. If \code{k} is 
    greater than or equal to the length of \code{poly}, the result is the 
    zero polynomial. 

*******************************************************************************

    Addition and subtraction

*******************************************************************************

void _nmod_poly_add(mp_ptr res, mp_srcptr poly1, long len1, 
                         mp_srcptr poly2, long len2, nmod_t mod)

    Sets \code{res} to the sum of \code{(poly1, len1)} and 
    \code{(poly2, len2)}. There are no restrictions on the lengths.

void nmod_poly_add(nmod_poly_t res, const nmod_poly_t poly1, 
                                            const nmod_poly_t poly2)

    Sets \code{res} to the sum of \code{poly1} and \code{poly2}.

void _nmod_poly_sub(mp_ptr res, mp_srcptr poly1, long len1, 
                         mp_srcptr poly2, long len2, nmod_t mod)

    Sets \code{res} to the difference of \code{(poly1, len1)} and 
    \code{(poly2, len2)}. There are no restrictions on the lengths.

void nmod_poly_sub(nmod_poly_t res, const nmod_poly_t poly1, 
                                    const nmod_poly_t poly2)

    Sets \code{res} to the difference of \code{poly1} and \code{poly2}.

void nmod_poly_neg(nmod_poly_t res, const nmod_poly_t poly)

    Sets \code{res} to the negation of \code{poly}.

*******************************************************************************

    Scalar multiplication and division

*******************************************************************************

void nmod_poly_scalar_mul_nmod(nmod_poly_t res, 
                          const nmod_poly_t poly, ulong c)

    Sets \code{res} to \code{poly} multiplied by~$c$, 
    where~$c$ is reduced modulo the modulus of \code{poly}.

void _nmod_poly_make_monic(mp_ptr output, 
                                      mp_srcptr input, long len, nmod_t mod)

    Sets \code{output} to be the scalar multiple of \code{input} of 
    length \code{len > 0} that has leading coefficient one, if such a 
    polynomial exists. If the leading coefficient of \code{input} is not
    invertible, \code{output} is set to the multiple of \code{input} whose
    leading coefficient is the greatest common divisor of the leading 
    coefficient and the modulus of \code{input}.

void nmod_poly_make_monic(nmod_poly_t output, const nmod_poly_t input)

    Sets \code{output} to be the scalar multiple of \code{input} with leading
    coefficient one, if such a polynomial exists. If \code{input} is zero
    an exception is raised. If the leading coefficient of \code{input} is not
    invertible, \code{output} is set to the multiple of \code{input} whose
    leading coefficient is the greatest common divisor of the leading 
    coefficient and the modulus of \code{input}.

*******************************************************************************

    Bit packing and unpacking

*******************************************************************************

void _nmod_poly_bit_pack(mp_ptr res, mp_srcptr poly, long len, 
                                                     mp_bitcnt_t bits)

    Packs \code{len} coefficients of \code{poly} into fields of the given 
    number of bits in the large integer \code{res}, i.e.\ evaluates 
    \code{poly} at \code{2^bits} and stores the result in \code{res}. 
    Assumes \code{len > 0} and \code{bits > 0}. Also assumes that no 
    coefficient of \code{poly} is bigger than \code{bits/2} bits. We 
    also assume \code{bits < 3 * FLINT_BITS}.

void _nmod_poly_bit_unpack(mp_ptr res, long len, 
                                mp_srcptr mpn, ulong bits, nmod_t mod)

    Unpacks \code{len} coefficients stored in the big integer \code{mpn} 
    in bit fields of the given number of bits, reduces them modulo the 
    given modulus, then stores them in the polynomial \code{res}. 
    We assume \code{len > 0} and \code{3 * FLINT_BITS > bits > 0}.
    There are no restrictions on the size of the actual coefficients as 
    stored within the bitfields. 

void nmod_poly_bit_pack(fmpz_t f, const nmod_poly_t poly, mp_bitcnt_t bit_size)

    Packs \code{poly} into bitfields of size \code{bit_size}, writing the
    result to \code{f}.

void nmod_poly_bit_unpack(nmod_poly_t poly, const fmpz_t f,
        mp_bitcnt_t bit_size)

    Unpacks the polynomial from fields of size \code{bit_size} as
    represented by the integer \code{f}.


*******************************************************************************

    Multiplication

*******************************************************************************

void _nmod_poly_mul_classical(mp_ptr res, mp_srcptr poly1, 
                    long len1, mp_srcptr poly2, long len2, nmod_t mod)

    Sets \code{(res, len1 + len2 - 1)} to the product of \code{(poly1, len1)}
    and \code{(poly2, len2)}. Assumes \code{len1 >= len2 > 0}. Aliasing of 
    inputs and output is not permitted.

void nmod_poly_mul_classical(nmod_poly_t res, 
                             const nmod_poly_t poly1, const nmod_poly_t poly2)

    Sets \code{res} to the product of \code{poly1} and \code{poly2}.

void _nmod_poly_mullow_classical(mp_ptr res, mp_srcptr poly1, long len1, 
                           mp_srcptr poly2, long len2, long trunc, nmod_t mod)

    Sets \code{res} to the lower \code{trunc} coefficients of the product of 
    \code{(poly1, len1)} and \code{(poly2, len2)}. Assumes that 
    \code{len1 >= len2 > 0} and \code{trunc > 0}. Aliasing of inputs and 
    output is not permitted.

void nmod_poly_mullow_classical(nmod_poly_t res, 
                  const nmod_poly_t poly1, const nmod_poly_t poly2, long trunc)

    Sets \code{res} to the lower \code{trunc} coefficients of the product 
    of \code{poly1} and \code{poly2}.

void _nmod_poly_mulhigh_classical(mp_ptr res, mp_srcptr poly1, 
            long len1, mp_srcptr poly2, long len2, long start, nmod_t mod)

    Computes the product of \code{(poly1, len1)} and \code{(poly2, len2)} 
    and writes the coefficients from \code{start} onwards into the high 
    coefficients of \code{res}, the remaining coefficients being arbitrary 
    but reduced.  Assumes that \code{len1 >= len2 > 0}. Aliasing of inputs 
    and output is not permitted.

void nmod_poly_mulhigh_classical(nmod_poly_t res, 
                  const nmod_poly_t poly1, const nmod_poly_t poly2, long start)

    Computes the product of \code{poly1} and \code{poly2} and writes the 
    coefficients from \code{start} onwards into the high coefficients of 
    \code{res}, the remaining coefficients being arbitrary but reduced.

void _nmod_poly_mul_KS(mp_ptr out, mp_srcptr in1, long len1, 
                     mp_srcptr in2, long len2, mp_bitcnt_t bits, nmod_t mod)

    Sets \code{out} to the product of \code{in1} of length \code{len1} 
    and \code{in2} of length \code{len2}, assuming the output coefficients 
    are at most the given number of bits wide. If \code{bits} is set 
    to $0$ an appropriate value is computed automatically.  Assumes that 
    \code{len1 >= len2 > 0}.

void nmod_poly_mul_KS(nmod_poly_t res, 
            const nmod_poly_t poly1, const nmod_poly_t poly2, mp_bitcnt_t bits)

    Sets \code{res} to the product of \code{poly1} and \code{poly2} 
    assuming the output coefficients are at most the given number of 
    bits wide. If \code{bits} is set to $0$ an appropriate value 
    is computed automatically.

void _nmod_poly_mullow_KS(mp_ptr out, mp_srcptr in1, long len1,
                mp_srcptr in2, long len2, mp_bitcnt_t bits, long n, nmod_t mod)

    Sets \code{out} to the low $n$ coefficients of \code{in1} of length
    \code{len1} times \code{in2} of length \code{len2}. The output must have
    space for \code{n} coefficients. We assume that \code{len1 >= len2 > 0}
    and that \code{0 < n <= len1 + len2 - 1}. 

void nmod_poly_mullow_KS(nmod_poly_t res, const nmod_poly_t poly1, 
                             const nmod_poly_t poly2, mp_bitcnt_t bits, long n)

    Sets \code{res} to the low $n$ coefficients of the product of
    \code{poly1} and \code{poly2}. 

void _nmod_poly_mul(mp_ptr res, mp_srcptr poly1, long len1, 
                                        mp_srcptr poly2, long len2, nmod_t mod)

    Sets \code{res} to the product of \code{poly1} of length \code{len1} 
    and \code{poly2} of length \code{len2}. Assumes \code{len1 >= len2 > 0}.
    No aliasing is permitted between the inputs and the output.

void nmod_poly_mul(nmod_poly_t res, 
                               const nmod_poly_t poly1, const nmod_poly_t poly2)

    Sets \code{res} to the product of \code{poly1} and \code{poly2}.

void _nmod_poly_mullow(mp_ptr res, mp_srcptr poly1, long len1, 
                                mp_srcptr poly2, long len2, long n, nmod_t mod)

    Sets \code{res} to the first \code{n} coefficients of the 
    product of \code{poly1} of length \code{len1} and \code{poly2} of
    length \code{len2}. It is assumed that \code{0 < n <= len1 + len2 - 1} 
    and that \code{len1 >= len2 > 0}. No aliasing of inputs and output
    is permitted.

void nmod_poly_mullow(nmod_poly_t res, const nmod_poly_t poly1, 
                                           const nmod_poly_t poly2, long trunc)

    Sets \code{res} to the first \code{trunc} coefficients of the 
    product of \code{poly1} and \code{poly2}.

void _nmod_poly_mulhigh(mp_ptr res, mp_srcptr poly1, long len1, 
                                mp_srcptr poly2, long len2, long n, nmod_t mod)

    Sets all but the low $n$ coefficients of \code{res} to the 
    corresponding coefficients of the product of \code{poly1} of length 
    \code{len1} and \code{poly2} of length \code{len2}, the other 
    coefficients being arbitrary. It is assumed that 
    \code{len1 >= len2 > 0} and that \code{0 < n <= len1 + len2 - 1}. 
    Aliasing of inputs and output is not permitted.

void nmod_poly_mulhigh(nmod_poly_t res, const nmod_poly_t poly1, 
                                          const nmod_poly_t poly2, long n)

    Sets all but the low $n$ coefficients of \code{res} to the 
    corresponding coefficients of the product of \code{poly1} and 
    \code{poly2}, the remaining coefficients being arbitrary.

void _nmod_poly_mulmod(mp_ptr res, mp_srcptr poly1, long len1, 
                             mp_srcptr poly2, long len2, mp_srcptr f,
                            long lenf, nmod_t mod)

    Sets \code{res} to the remainder of the product of \code{poly1} and
    \code{poly2} upon polynomial division by \code{f}.

    It is required that \code{len1 + len2 - lenf > 0}, which is equivalent
    to requiring that the result will actually be reduced. Otherwise, simply
    use \code{_nmod_poly_mul} instead.

    Aliasing of \code{f} and \code{res} is not permitted.

void nmod_poly_mulmod(nmod_poly_t res,
    const nmod_poly_t poly1, const nmod_poly_t poly2, const nmod_poly_t f)

    Sets \code{res} to the remainder of the product of \code{poly1} and
    \code{poly2} upon polynomial division by \code{f}.

*******************************************************************************

    Powering

*******************************************************************************

void _nmod_poly_pow_binexp(mp_ptr res, 
                             mp_srcptr poly, long len, ulong e, nmod_t mod)

    Raises \code{poly} of length \code{len} to the power \code{e} and sets 
    \code{res} to the result. We require that \code{res} has enough space
    for \code{(len - 1)*e + 1} coefficients. Assumes that \code{len > 0}, 
    \code{e > 1}. Aliasing is not permitted. Uses the binary exponentiation
    method.

void nmod_poly_pow_binexp(nmod_poly_t res, const nmod_poly_t poly, ulong e)

    Raises \code{poly} to the power \code{e} and sets \code{res} to the 
    result. Uses the binary exponentiation method.

void _nmod_poly_pow(mp_ptr res,
                             mp_srcptr poly, long len, ulong e, nmod_t mod)

    Raises \code{poly} of length \code{len} to the power \code{e} and sets 
    \code{res} to the result. We require that \code{res} has enough space
    for \code{(len - 1)*e + 1} coefficients. Assumes that \code{len > 0}, 
    \code{e > 1}. Aliasing is not permitted. 

void nmod_poly_pow(nmod_poly_t res, const nmod_poly_t poly, ulong e)

    Raises \code{poly} to the power \code{e} and sets \code{res} to the 
    result.

void _nmod_poly_pow_trunc_binexp(mp_ptr res, mp_srcptr poly, 
                                           ulong e, long trunc, nmod_t mod)

    Sets \code{res} to the low \code{trunc} coefficients of \code{poly}
    (assumed to be zero padded if necessary to length \code{trunc}) to 
    the power \code{e}. This is equivalent to doing a powering followed
    by a truncation. We require that \code{res} has enough space for
    \code{trunc} coefficients, that \code{trunc > 0} and that 
    \code{e > 1}. Aliasing is not permitted. Uses the binary 
    exponentiation method.

void nmod_poly_pow_trunc_binexp(nmod_poly_t res, 
                               const nmod_poly_t poly, ulong e, long trunc)

    Sets \code{res} to the low \code{trunc} coefficients of \code{poly}
    to the power \code{e}. This is equivalent to doing a powering 
    followed by a truncation. Uses the binary exponentiation method.

void _nmod_poly_pow_trunc(mp_ptr res, mp_srcptr poly, 
                                           ulong e, long trunc, nmod_t mod)

    Sets \code{res} to the low \code{trunc} coefficients of \code{poly}
    (assumed to be zero padded if necessary to length \code{trunc}) to 
    the power \code{e}. This is equivalent to doing a powering followed
    by a truncation. We require that \code{res} has enough space for
    \code{trunc} coefficients, that \code{trunc > 0} and that 
    \code{e > 1}. Aliasing is not permitted.

void nmod_poly_pow_trunc(nmod_poly_t res, 
                               const nmod_poly_t poly, ulong e, long trunc)

    Sets \code{res} to the low \code{trunc} coefficients of \code{poly}
    to the power \code{e}. This is equivalent to doing a powering 
    followed by a truncation.

void _nmod_poly_powmod_ui_binexp(mp_ptr res, mp_srcptr poly, 
                                ulong e, mp_srcptr f,
                                long lenf, nmod_t mod)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e > 0}.

    We require \code{lenf > 1}. It is assumed that \code{poly} is already
    reduced modulo \code{f} and zero-padded as necessary to have length
    exactly \code{lenf - 1}. The output \code{res} must have room for
    \code{lenf - 1} coefficients.

void nmod_poly_powmod_ui_binexp(nmod_poly_t res, 
                           const nmod_poly_t poly, ulong e,
                           const nmod_poly_t f)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e >= 0}.

void _nmod_poly_powmod_mpz_binexp(mp_ptr res, mp_srcptr poly, 
                                mpz_srcptr e, mp_srcptr f,
                                long lenf, nmod_t mod)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e > 0}.

    We require \code{lenf > 1}. It is assumed that \code{poly} is already
    reduced modulo \code{f} and zero-padded as necessary to have length
    exactly \code{lenf - 1}. The output \code{res} must have room for
    \code{lenf - 1} coefficients.

void nmod_poly_powmod_mpz_binexp(nmod_poly_t res, 
                           const nmod_poly_t poly, mpz_srcptr e,
                           const nmod_poly_t f)

    Sets \code{res} to \code{poly} raised to the power \code{e}
    modulo \code{f}, using binary exponentiation. We require \code{e >= 0}.

*******************************************************************************

    Division

*******************************************************************************

void _nmod_poly_divrem_basecase(mp_ptr Q, mp_ptr R, mp_ptr W, 
           mp_srcptr A, long A_len, mp_srcptr B, long B_len, nmod_t mod)

    Finds $Q$ and $R$ such that $A = B Q + R$ with $\len(R) < \len(B)$.
    If $\len(B) = 0$ an exception is raised. We require that \code{W}
    is temporary space of \code{NMOD_DIVREM_BC_ITCH(A_len, B_len, mod)}
    coefficients.

void nmod_poly_divrem_basecase(nmod_poly_t Q, 
                       nmod_poly_t R, const nmod_poly_t A, const nmod_poly_t B)

    Finds $Q$ and $R$ such that $A = B Q + R$ with $\len(R) < \len(B)$. 
    If $\len(B) = 0$ an exception is raised.

void _nmod_poly_div_basecase(mp_ptr Q, mp_ptr W, mp_srcptr A, long A_len, 
                                          mp_srcptr B, long B_len, nmod_t mod)

    Notionally finds polynomials $Q$ and $R$ such that $A = B Q + R$ with 
    $\len(R) < \len(B)$, but returns only \code{Q}. If $\len(B) = 0$ an 
    exception is raised. We require that \code{W} is temporary space of 
    \code{NMOD_DIV_BC_ITCH(A_len, B_len, mod)} coefficients.

void nmod_poly_div_basecase(nmod_poly_t Q, const nmod_poly_t A,
                                                          const nmod_poly_t B)

    Notionally finds polynomials $Q$ and $R$ such that $A = B Q + R$ with 
    $\len(R) < \len(B)$, but returns only \code{Q}. If $\len(B) = 0$ an 
    exception is raised.

void _nmod_poly_divrem_divconquer_recursive(mp_ptr Q, mp_ptr BQ, mp_ptr W,  
                     mp_ptr V, mp_srcptr A, mp_srcptr B, long lenB, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than 
    \code{lenB}, where \code{A} is of length \code{2 * lenB - 1} and \code{B} 
    is of length \code{lenB}. Sets \code{BQ} to the low \code{lenB - 1} 
    coefficients of \code{B * Q}. We require that \code{Q} have space for 
    \code{lenB} coefficients, that \code{W} be temporary space of size 
    \code{lenB - 1} and \code{V} be temporary space for a number of 
    coefficients computed by \code{NMOD_DIVREM_DC_ITCH(lenB, mod)}.

void _nmod_poly_divrem_divconquer(mp_ptr Q, mp_ptr R, 
                    mp_srcptr A, long lenA, mp_srcptr B, long lenB, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than 
    \code{lenB}, where \code{A} is of length \code{lenA} and \code{B} is of 
    length \code{lenB}. We require that \code{Q} have space for 
    \code{lenA - lenB + 1} coefficients.

void nmod_poly_divrem_divconquer(nmod_poly_t Q, nmod_poly_t R,
                                      const nmod_poly_t A, const nmod_poly_t B)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$.

void _nmod_poly_divrem_q0(mp_ptr Q, mp_ptr R, 
                          mp_srcptr A, mp_srcptr B, long lenA, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$, 
    where $\len(A) = \len(B) > 0$.

    Requires that $Q$ and $R$ have space for $1$ and $\len(B) - 1$ 
    coefficients, respectively.

    Does not support aliasing or zero-padding.

void _nmod_poly_divrem_q1(mp_ptr Q, mp_ptr R, 
                          mp_srcptr A, long lenA, mp_srcptr B, long lenB,
                          nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$, 
    where $\len(A) = \len(B) + 1 \geq \len(B) > 0$.

    Requires that $Q$ and $R$ have space for $\len(A) - \len(B) + 1$ and 
    $\len(B) - 1$ coefficients, respectively.

    Does not support aliasing or zero-padding.

void _nmod_poly_divrem(mp_ptr Q, mp_ptr R, 
                    mp_srcptr A, long lenA, mp_srcptr B, long lenB, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than 
    \code{lenB}, where \code{A} is of length \code{lenA} and \code{B} is of 
    length \code{lenB}. We require that \code{Q} have space for 
    \code{lenA - lenB + 1} coefficients.

void nmod_poly_divrem(nmod_poly_t Q, nmod_poly_t R,
                                      const nmod_poly_t A, const nmod_poly_t B)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$.

void _nmod_poly_div_divconquer_recursive(mp_ptr Q, mp_ptr W, mp_ptr V,
                               mp_srcptr A, mp_srcptr B, long lenB, nmod_t mod)

    Notionally computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ 
    less than \code{lenB}, where \code{A} is of length \code{2 * lenB - 1} 
    and \code{B} is of length \code{lenB}, but returns only \code{Q}. We 
    require that \code{Q} have space for \code{lenB} coefficients and that 
    \code{W} be temporary space of size \code{lenB - 1} and \code{V} be 
    temporary space for a number of coefficients computed by 
    \code{NMOD_DIV_DC_ITCH(lenB, mod)}. 

void _nmod_poly_div_divconquer(mp_ptr Q, mp_srcptr A, long lenA, 
                                            mp_srcptr B, long lenB, nmod_t mod)

    Notionally computes polynomials $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R)$ less than \code{lenB}, where \code{A} is of length \code{lenA} 
    and \code{B} is of length \code{lenB}, but returns only \code{Q}. We 
    require that \code{Q} have space for \code{lenA - lenB + 1} coefficients.

void nmod_poly_div_divconquer(nmod_poly_t Q,
                                      const nmod_poly_t A, const nmod_poly_t B)

    Notionally computes $Q$ and $R$ such that $A = BQ + R$ with 
    $\len(R) < \len(B)$, but returns only $Q$.

void _nmod_poly_div(mp_ptr Q, mp_srcptr A, long lenA, 
                                            mp_srcptr B, long lenB, nmod_t mod)

    Notionally computes polynomials $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R)$ less than \code{lenB}, where \code{A} is of length \code{lenA} 
    and \code{B} is of length \code{lenB}, but returns only \code{Q}. We 
    require that \code{Q} have space for \code{lenA - lenB + 1} coefficients.


void nmod_poly_div(nmod_poly_t Q, const nmod_poly_t A, const nmod_poly_t B)

    Computes the quotient $Q$ on polynomial division of $A$ by $B$.

void _nmod_poly_rem_basecase(mp_ptr R, mp_ptr W, mp_srcptr A, long lenA, 
                                       mp_srcptr B, long lenB, nmod_t mod)

void nmod_poly_rem_basecase(nmod_poly_t R, 
                            const nmod_poly_t A, const nmod_poly_t B)

void _nmod_poly_rem_q1(mp_ptr R, 
                       mp_srcptr A, long lenA, mp_srcptr B, long lenB,
                       nmod_t mod)

    Notionally computes $Q$ and $R$ such that $A = BQ + R$ with 
    $\len(R) < \len(B)$, where $\len(A) = \len(B) + 1 \geq \len(B) > 0$, 
    but returns only the remainder.

    Requires that $R$ has space for $\len(B) - 1$ coefficients.

    Does not support aliasing or zero-padding.

void _nmod_poly_rem(mp_ptr R, mp_srcptr A, long lenA, 
                              mp_srcptr B, long lenB, nmod_t mod)

    Computes the remainder $R$ on polynomial division of $A$ by $B$.

void nmod_poly_rem(nmod_poly_t R, const nmod_poly_t A, const nmod_poly_t B)

    Computes the remainder $R$ on polynomial division of $A$ by $B$.

void _nmod_poly_inv_series_basecase(mp_ptr Qinv, 
                                    mp_srcptr Q, long n, nmod_t mod)
    
    Given \code{Q} of length \code{n} whose leading coefficient is invertible
    modulo the given modulus, finds a polynomial \code{Qinv} of length \code{n}
    such that the top \code{n} coefficients of the product \code{Q * Qinv} is
    $x^{n - 1}$. Requires that \code{n > 0}. This function can be viewed as 
    inverting a power series.

void nmod_poly_inv_series_basecase(nmod_poly_t Qinv, 
                                   const nmod_poly_t Q, long n)

    Given \code{Q} of length at least \code{n} find \code{Qinv} of length
    \code{n} such that the top \code{n} coefficients of the product 
    \code{Q * Qinv} is $x^{n - 1}$. An exception is raised if \code{n = 0}
    or if the length of \code{Q} is less than \code{n}. The leading 
    coefficient of \code{Q} must be invertible modulo the modulus of 
    \code{Q}. This function can be viewed as inverting a power series.

void _nmod_poly_inv_series_newton(mp_ptr Qinv, mp_srcptr Q, long n, nmod_t mod)
    
    Given \code{Q} of length \code{n} whose constant coefficient is invertible
    modulo the given modulus, find a polynomial \code{Qinv} of length \code{n}
    such that \code{Q * Qinv} is \code{1} modulo $x^n$. Requires \code{n > 0}.
    This function can be viewed as inverting a power series via Newton 
    iteration.

void nmod_poly_inv_series_newton(nmod_poly_t Qinv, const nmod_poly_t Q, long n)

    Given \code{Q} find \code{Qinv} such that \code{Q * Qinv} is \code{1}
    modulo $x^n$. The constant coefficient of \code{Q} must be invertible 
    modulo the modulus of \code{Q}. An exception is raised if this is not
    the case or if \code{n = 0}. This function can be viewed as inverting 
    a power series via Newton iteration.

void _nmod_poly_inv_series(mp_ptr Qinv, mp_srcptr Q, long n, nmod_t mod)
    
    Given \code{Q} of length \code{n} whose constant coefficient is invertible
    modulo the given modulus, find a polynomial \code{Qinv} of length \code{n}
    such that \code{Q * Qinv} is \code{1} modulo $x^n$. Requires \code{n > 0}.
    This function can be viewed as inverting a power series.

void nmod_poly_inv_series(nmod_poly_t Qinv, const nmod_poly_t Q, long n)

    Given \code{Q} find \code{Qinv} such that \code{Q * Qinv} is \code{1}
    modulo $x^n$. The constant coefficient of \code{Q} must be invertible 
    modulo the modulus of \code{Q}. An exception is raised if this is not
    the case or if \code{n = 0}. This function can be viewed as inverting 
    a power series.

void _nmod_poly_div_series(mp_ptr Q, mp_srcptr A, mp_srcptr B, 
                                                  long n, nmod_t mod)

    Given polynomials \code{A} and \code{B} of length \code{n}, finds the
    polynomial \code{Q} of length \code{n} such that \code{Q * B = A}
    modulo $x^n$. We assume \code{n > 0} and that the constant coefficient 
    of \code{B} is invertible modulo the given modulus. The polynomial 
    \code{Q} must have space for \code{n} coefficients.

void nmod_poly_div_series(nmod_poly_t Q, const nmod_poly_t A, 
                                         const nmod_poly_t B, long n)

    Given polynomials \code{A} and \code{B} considered modulo $x^n$, 
    finds the polynomial \code{Q} of length at most \code{n} such that
    \code{Q * B = A} modulo $x^n$. We assume \code{n > 0} and that the
    constant coefficient of \code{B} is invertible modulo the modulus. 
    An exception is raised if \code{n == 0} or the constant coefficient
    of \code{B} is zero.

void _nmod_poly_div_newton(mp_ptr Q, mp_srcptr A, long Alen, 
                                     mp_srcptr B, long Blen, nmod_t mod)

    Notionally computes polynomials $Q$ and $R$ such that $A = BQ + R$ with
    $\len(R)$ less than \code{Blen}, where \code{A} is of length \code{Alen} 
    and \code{B} is of length \code{Blen}, but returns only $Q$.

    We require that $Q$ have space for \code{Alen - Blen + 1} coefficients 
    and assume that the leading coefficient of $B$ is a unit.

    The algorithm used is to reverse the polynomials and divide the 
    resulting power series, then reverse the result.

void nmod_poly_div_newton(nmod_poly_t Q, const nmod_poly_t A,
                                         const nmod_poly_t B)

    Notionally computes $Q$ and $R$ such that $A = BQ + R$ with 
    $\len(R) < \len(B)$, but returns only $Q$.

    We assume that the leading coefficient of $B$ is a unit.

    The algorithm used is to reverse the polynomials and divide the 
    resulting power series, then reverse the result.

void _nmod_poly_divrem_newton(mp_ptr Q, mp_ptr R, mp_srcptr A, long Alen, 
                                        mp_srcptr B, long Blen, nmod_t mod)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R)$ less than 
    \code{Blen}, where $A$ is of length \code{Alen} and $B$ is of length 
    \code{Blen}. We require that $Q$ have space for \code{Alen - Blen + 1}
    coefficients. The algorithm used is to call \code{div_newton()} and then
    multiply out and compute the remainder.

void nmod_poly_divrem_newton(nmod_poly_t Q, nmod_poly_t R, 
                             const nmod_poly_t A, const nmod_poly_t B)

    Computes $Q$ and $R$ such that $A = BQ + R$ with $\len(R) < \len(B)$. 
    The algorithm used is to call \code{div_newton()} and then multiply out 
    and compute the remainder.

mp_limb_t _nmod_poly_div_root(mp_ptr Q, mp_srcptr A, long len,
                                mp_limb_t c, nmod_t mod)

    Sets \code{(Q, len-1)} to the quotient of \code{(A, len)} on division
    by $(x - c)$, and returns the remainder, equal to the value of $A$
    evaluated at $c$. $A$ and $Q$ are allowed to be the same, but may
    not overlap partially in any other way.

mp_limb_t nmod_poly_div_root(nmod_poly_t Q, const nmod_poly_t A, mp_limb_t c)

    Sets $Q$ to the quotient of $A$ on division by $(x - c)$, and returns
    the remainder, equal to the value of $A$ evaluated at $c$.

*******************************************************************************

    Derivative and integral

*******************************************************************************

void _nmod_poly_derivative(mp_ptr x_prime, mp_srcptr x, long len, nmod_t mod)

    Sets the first \code{len - 1} coefficients of \code{x_prime} to the 
    derivative of \code{x} which is assumed to be of length \code{len}. 
    It is assumed that \code{len > 0}.

void nmod_poly_derivative(nmod_poly_t x_prime, const nmod_poly_t x)

    Sets \code{x_prime} to the derivative of \code{x}. 

void _nmod_poly_integral(mp_ptr x_int, mp_srcptr x, long len, nmod_t mod)

    Set the first \code{len} coefficients of \code{x_int} to the 
    integral of \code{x} which is assumed to be of length \code{len - 1}.
    The constant term of \code{x_int} is set to zero.
    It is assumed that \code{len > 0}. The result is only well-defined
    if the modulus is a prime number strictly larger than the degree of
    \code{x}.

void nmod_poly_integral(nmod_poly_t x_int, const nmod_poly_t x)

    Set \code{x_int} to the indefinite integral of \code{x} with constant
    term zero. The result is only well-defined if the modulus
    is a prime number strictly larger than the degree of \code{x}.


*******************************************************************************

    Evaluation

*******************************************************************************

mp_limb_t _nmod_poly_evaluate_nmod(mp_srcptr poly, long len, mp_limb_t c, 
                                   nmod_t mod)

    Evaluates \code{poly} at the value~\code{c} and reduces modulo the 
    given modulus of \code{poly}. The value~\code{c} should be reduced 
    modulo the modulus. The algorithm used is Horner's method.

mp_limb_t nmod_poly_evaluate_nmod(nmod_poly_t poly, mp_limb_t c)

    Evaluates \code{poly} at the value~\code{c} and reduces modulo the 
    modulus of \code{poly}. The value~\code{c} should be reduced modulo
    the modulus. The algorithm used is Horner's method.

*******************************************************************************

    Multipoint evaluation

*******************************************************************************

void _nmod_poly_evaluate_nmod_vec_iter(mp_ptr ys, mp_srcptr poly, long len,
                                    mp_srcptr xs, long n, nmod_t mod)

    Evaluates (\code{poly}, \code{len}) at the \code{n} values
    given in the vector \code{xs}, writing the output values
    to \code{ys}. The values in \code{xs} should be reduced
    modulo the modulus.

    Uses Horner's method iteratively.

void nmod_poly_evaluate_nmod_vec_iter(mp_ptr ys, const nmod_poly_t poly,
                                    mp_srcptr xs, long n)

    Evaluates \code{poly} at the \code{n} values given in the vector
    \code{xs}, writing the output values to \code{ys}. The values in
    \code{xs} should be reduced modulo the modulus.

    Uses Horner's method iteratively.

void _nmod_poly_evaluate_nmod_vec_fast_precomp(mp_ptr vs, mp_srcptr poly,
    long plen, mp_ptr * tree, long len, nmod_t mod)

    Evaluates (\code{poly}, \code{plen}) at the \code{len} values given
    by the precomputed subproduct tree \code{tree}.

void _nmod_poly_evaluate_nmod_vec_fast(mp_ptr ys, mp_srcptr poly,
        long len, mp_srcptr xs, long n, nmod_t mod)

    Evaluates (\code{poly}, \code{len}) at the \code{n} values
    given in the vector \code{xs}, writing the output values
    to \code{ys}. The values in \code{xs} should be reduced
    modulo the modulus.

    Uses fast multipoint evaluation, building a temporary subproduct tree.

void nmod_poly_evaluate_nmod_vec_fast(mp_ptr ys, const nmod_poly_t poly,
                                    mp_srcptr xs, long n)

    Evaluates \code{poly} at the \code{n} values given in the vector
    \code{xs}, writing the output values to \code{ys}. The values in
    \code{xs} should be reduced modulo the modulus.

    Uses fast multipoint evaluation, building a temporary subproduct tree.


void _nmod_poly_evaluate_nmod_vec(mp_ptr ys, mp_srcptr poly, long len,
                                    mp_srcptr xs, long n, nmod_t mod)

    Evaluates (\code{poly}, \code{len}) at the \code{n} values
    given in the vector \code{xs}, writing the output values
    to \code{ys}. The values in \code{xs} should be reduced
    modulo the modulus.

void nmod_poly_evaluate_nmod_vec(mp_ptr ys, const nmod_poly_t poly,
                                    mp_srcptr xs, long n)

    Evaluates \code{poly} at the \code{n} values given in the vector
    \code{xs}, writing the output values to \code{ys}. The values in
    \code{xs} should be reduced modulo the modulus.

*******************************************************************************

    Interpolation

*******************************************************************************

void _nmod_poly_interpolate_nmod_vec(mp_ptr poly,
                            mp_srcptr xs, mp_srcptr ys, long n, nmod_t mod)

    Sets \code{poly} to the unique polynomial of length at most \code{n}
    that interpolates the \code{n} given evaluation points \code{xs} and
    values \code{ys}. If the interpolating polynomial is shorter than
    length \code{n}, the leading coefficients are set to zero.

    The values in \code{xs} and \code{ys} should be reduced modulo the
    modulus, and all \code{xs} must be distinct. Aliasing between
    \code{poly} and \code{xs} or \code{ys} is not allowed.

void nmod_poly_interpolate_nmod_vec(nmod_poly_t poly,
                                    mp_srcptr xs, mp_srcptr ys, long n)

    Sets \code{poly} to the unique polynomial of length at most \code{n} that
    interpolates the \code{n} given evaluation points \code{xs} and
    values \code{ys}. The values in \code{xs} and \code{ys} should be
    reduced modulo the modulus, and all \code{xs} must be distinct.

void _nmod_poly_interpolation_weights(mp_ptr w, mp_ptr * tree, long len,
        nmod_t mod)

    Sets \code{w} to the barycentric interpolation weights for fast
    Lagrange interpolation with respect to a given subproduct tree.

void _nmod_poly_interpolate_nmod_vec_fast_precomp(mp_ptr poly, mp_srcptr ys,
    mp_ptr * tree, mp_srcptr weights, long len, nmod_t mod)

    Performs interpolation using the fast Lagrange interpolation
    algorithm, using a precomputed subproduct tree.

    The function values are given as \code{ys}. The function takes
    a precomputed subproduct tree \code{tree} and barycentric
    interpolation weights \code{weights} corresponding to the
    roots.

void _nmod_poly_interpolate_nmod_vec_fast(mp_ptr poly,
                            mp_srcptr xs, mp_srcptr ys, long n, nmod_t mod)

    Performs interpolation using the fast Lagrange interpolation
    algorithm, generating a temporary subproduct tree.

void nmod_poly_interpolate_nmod_vec_fast(nmod_poly_t poly,
                                    mp_srcptr xs, mp_srcptr ys, long n)

    Performs interpolation using the fast Lagrange interpolation algorithm,
    generating a temporary subproduct tree.

void _nmod_poly_interpolate_nmod_vec_newton(mp_ptr poly,
                            mp_srcptr xs, mp_srcptr ys, long n, nmod_t mod)

    Forms the interpolating polynomial in the Newton basis using
    the method of divided differences and then converts it to
    monomial form.

void nmod_poly_interpolate_nmod_vec_newton(nmod_poly_t poly,
                                    mp_srcptr xs, mp_srcptr ys, long n)

    Forms the interpolating polynomial in the Newton basis using
    the method of divided differences and then converts it to
    monomial form.

void _nmod_poly_interpolate_nmod_vec_barycentric(mp_ptr poly,
                            mp_srcptr xs, mp_srcptr ys, long n, nmod_t mod)

    Forms the interpolating polynomial using a naive implementation
    of the barycentric form of Lagrange interpolation.

void nmod_poly_interpolate_nmod_vec_barycentric(nmod_poly_t poly,
                                    mp_srcptr xs, mp_srcptr ys, long n)

    Forms the interpolating polynomial using a naive implementation
    of the barycentric form of Lagrange interpolation.


*******************************************************************************

    Composition

*******************************************************************************

void _nmod_poly_compose_horner(mp_ptr res, mp_srcptr poly1, long len1, 
                               mp_srcptr poly2, long len2, nmod_t mod)

    Composes \code{poly1} of length \code{len1} with \code{poly2} of length 
    \code{len2} and sets \code{res} to the result, i.e.\ evaluates 
    \code{poly1} at \code{poly2}. The algorithm used is Horner's algorithm. 
    We require that \code{res} have space for \code{(len1 - 1)*(len2 - 1) + 1}
    coefficients. It is assumed that \code{len1 > 0} and \code{len2 > 0}.

void nmod_poly_compose_horner(nmod_poly_t res, 
                              const nmod_poly_t poly1, const nmod_poly_t poly2)

    Composes \code{poly1} with \code{poly2} and sets \code{res} to the result,
    i.e.\ evaluates \code{poly1} at \code{poly2}. The algorithm used is 
    Horner's algorithm.

void _nmod_poly_compose_divconquer(mp_ptr res, mp_srcptr poly1, long len1, 
                                   mp_srcptr poly2, long len2, nmod_t mod)

    Composes \code{poly1} of length \code{len1} with \code{poly2} of length 
    \code{len2} and sets \code{res} to the result, i.e.\ evaluates 
    \code{poly1} at \code{poly2}. The algorithm used is the divide and 
    conquer algorithm. We require that \code{res} have space for 
    \code{(len1 - 1)*(len2 - 1) + 1} coefficients. It is assumed that 
    \code{len1 > 0} and \code{len2 > 0}.

void nmod_poly_compose_divconquer(nmod_poly_t res, 
                              const nmod_poly_t poly1, const nmod_poly_t poly2)

    Composes \code{poly1} with \code{poly2} and sets \code{res} to the result,
    i.e.\ evaluates \code{poly1} at \code{poly2}. The algorithm used is 
    the divide and conquer algorithm.

void _nmod_poly_compose(mp_ptr res, mp_srcptr poly1, long len1, 
                                        mp_srcptr poly2, long len2, nmod_t mod)

    Composes \code{poly1} of length \code{len1} with \code{poly2} of length
    \code{len2} and sets \code{res} to the result, i.e.\ evaluates \code{poly1}
    at \code{poly2}. We require that \code{res} have space for 
    \code{(len1 - 1)*(len2 - 1) + 1} coefficients. It is assumed that 
    \code{len1 > 0} and \code{len2 > 0}.

void nmod_poly_compose(nmod_poly_t res, 
                              const nmod_poly_t poly1, const nmod_poly_t poly2)

    Composes \code{poly1} with \code{poly2} and sets \code{res} to the result,
    that is, evaluates \code{poly1} at \code{poly2}.

*******************************************************************************

    Taylor shift

*******************************************************************************

void _nmod_poly_taylor_shift_horner(mp_ptr poly, mp_limb_t c,
    long len, nmod_t mod)

    Performs the Taylor shift composing \code{poly} by $x+c$ in-place.
    Uses an efficient version of Horner's rule.

void nmod_poly_taylor_shift_horner(nmod_poly_t g,
    const nmod_poly_t f, mp_limb_t c)

    Performs the Taylor shift composing \code{f} by $x+c$.

void _nmod_poly_taylor_shift_convolution(mp_ptr poly, mp_limb_t c,
    long len, nmod_t mod)

    Performs the Taylor shift composing \code{poly} by $x+c$ in-place.
    Writes the composition as a single convolution with cost $O(M(n))$.
    We require that the modulus is a prime at least as large as the length.

void nmod_poly_taylor_shift_convolution(nmod_poly_t g,
    const nmod_poly_t f, mp_limb_t c)

    Performs the Taylor shift composing \code{f} by $x+c$.
    Writes the composition as a single convolution with cost $O(M(n))$.
    We require that the modulus is a prime at least as large as the length.

void _nmod_poly_taylor_shift(mp_ptr poly, mp_limb_t c, long len, nmod_t mod)

    Performs the Taylor shift composing \code{poly} by $x+c$ in-place.
    We require that the modulus is a prime.

void nmod_poly_taylor_shift(nmod_poly_t g, const nmod_poly_t f, mp_limb_t c)

    Performs the Taylor shift composing \code{f} by $x+c$.
    We require that the modulus is a prime.

*******************************************************************************

    Modular composition

*******************************************************************************

void _nmod_poly_compose_mod_horner(mp_ptr res,
    mp_srcptr f, long lenf, mp_srcptr g, mp_srcptr h, long lenh, nmod_t mod)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that the length of $g$ is one less than the
    length of $h$ (possibly with zero padding). The output is not allowed
    to be aliased with any of the inputs.

    The algorithm used is Horner's rule.

void nmod_poly_compose_mod_horner(nmod_poly_t res, 
                    const nmod_poly_t f, const nmod_poly_t g,
                    const nmod_poly_t h)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero. The algorithm used is Horner's rule.


void _nmod_poly_compose_mod_brent_kung(mp_ptr res,
    mp_srcptr f, long lenf, mp_srcptr g, mp_srcptr h, long lenh, nmod_t mod)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that the length of $g$ is one less than the
    length of $h$ (possibly with zero padding). We also require that
    the length of $f$ is less than the length of $h$. The output is not allowed
    to be aliased with any of the inputs.

    The algorithm used is the Brent-Kung matrix algorithm.

void nmod_poly_compose_mod_brent_kung(nmod_poly_t res, 
                    const nmod_poly_t f, const nmod_poly_t g,
                    const nmod_poly_t h)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that $f$ has smaller degree than $h$.
    The algorithm used is the Brent-Kung matrix algorithm.

void _nmod_poly_compose_mod(mp_ptr res,
    mp_srcptr f, long lenf, mp_srcptr g, mp_srcptr h, long lenh, nmod_t mod)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero and that the length of $g$ is one less than the
    length of $h$ (possibly with zero padding). The output is not allowed
    to be aliased with any of the inputs.

void nmod_poly_compose_mod(nmod_poly_t res, 
                    const nmod_poly_t f, const nmod_poly_t g,
                    const nmod_poly_t h)

    Sets \code{res} to the composition $f(g)$ modulo $h$. We require that
    $h$ is nonzero.


*******************************************************************************

    Greatest common divisor

*******************************************************************************

long _nmod_poly_gcd_euclidean(mp_ptr G, 
                    mp_srcptr A, long lenA, mp_srcptr B, long lenB, nmod_t mod)

    Computes the GCD of $A$ of length \code{lenA} and $B$ of length
    \code{lenB}, where \code{lenA >= lenB > 0}. The length of the GCD $G$
    is returned by the function. No attempt is made to make the GCD monic. It
    is required that $G$ have space for \code{lenB} coefficients.

void nmod_poly_gcd_euclidean(nmod_poly_t G, 
                             const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

long _nmod_poly_hgcd(mp_ptr *M, long *lenM, 
                     mp_ptr A, long *lenA, mp_ptr B, long *lenB, 
                     mp_srcptr a, long lena, mp_srcptr b, long lenb, 
                     nmod_t mod)

    Computes the HGCD of $a$ and $b$, that is, a matrix~$M$, a sign~$\sigma$ 
    and two polynomials $A$ and $B$ such that 
    \begin{equation*}
    (A,B)^t = \sigma M^{-1} (a,b)^t.
    \end{equation*}

    Assumes that $\len(a) > \len(b) > 0$.

    Assumes that $A$ and $B$ have space of size at least $\len(a)$ 
    and $\len(b)$, respectively.  On exit, \code{*lenA} and \code{*lenB} 
    will contain the correct lengths of $A$ and $B$.

    Assumes that \code{M[0]}, \code{M[1]}, \code{M[2]}, and \code{M[3]} 
    each point to a vector of size at least $\len(a)$.

long _nmod_poly_gcd_hgcd(mp_ptr G, mp_srcptr A, long lenA, 
                                   mp_srcptr B, long lenB, nmod_t mod)

    Computes the monic GCD of $A$ and $B$, assuming that 
    $\len(A) \geq \len(B) > 0$.

    Assumes that $G$ has space for $\len(B)$ coefficients and 
    returns the length of $G$ on output.

void nmod_poly_gcd_hgcd(nmod_poly_t G, 
                        const nmod_poly_t A, const nmod_poly_t B)

    Computes the monic GCD of $A$ and $B$ using the HGCD algorithm.

    As a special case, the GCD of two zero polynomials is defined to be 
    the zero polynomial.

    The time complexity of the algorithm is $\mathcal{O}(n \log^2 n)$. 
    For further details, see~\citep{ThullYap1990}.

long _nmod_poly_gcd(mp_ptr G, 
                    mp_srcptr A, long lenA, mp_srcptr B, long lenB, nmod_t mod)

    Computes the GCD of $A$ of length \code{lenA} and $B$ of length
    \code{lenB}, where \code{lenA >= lenB > 0}. The length of the GCD $G$
    is returned by the function. No attempt is made to make the GCD monic. It
    is required that $G$ have space for \code{lenB} coefficients.

void nmod_poly_gcd(nmod_poly_t G, 
                             const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

long _nmod_poly_xgcd_euclidean(mp_ptr G, mp_ptr S, mp_ptr T, 
             mp_srcptr A, long A_len, mp_srcptr B, long B_len, nmod_t mod)

    Computes the GCD of $A$ and $B$ together with cofactors $S$ and $T$ 
    such that $S A + T B = G$.  Returns the length of $G$.

    Assumes that $\len(A) \geq \len(B) \geq 1$ and 
    $(\len(A),\len(B)) \neq (1,1)$.

    No attempt is made to make the GCD monic.

    Requires that $G$ have space for $\len(B)$ coefficients.  Writes 
    $\len(B)-1$ and $\len(A)-1$ coefficients to $S$ and $T$, respectively. 
    Note that, in fact, $\len(S) \leq \max(\len(B) - \len(G), 1)$ and 
    $\len(T) \leq \max(\len(A) - \len(G), 1)$.

    No aliasing of input and output operands is permitted.

void nmod_poly_xgcd_euclidean(nmod_poly_t G, nmod_poly_t S, nmod_poly_t T,
                                    const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

    Polynomials \code{S} and \code{T} are computed such that 
    \code{S*A + T*B = G}. The length of \code{S} will be at most 
    \code{lenB} and the length of \code{T} will be at most \code{lenA}.

long _nmod_poly_xgcd_hgcd(mp_ptr G, mp_ptr S, mp_ptr T, 
             mp_srcptr A, long A_len, mp_srcptr B, long B_len, nmod_t mod)

    Computes the GCD of $A$ and $B$, where $\len(A) \geq \len(B) > 0$, 
    together with cofactors $S$ and $T$ such that $S A + T B = G$. Returns 
    the length of $G$.

    No attempt is made to make the GCD monic.

    Requires that $G$ have space for $\len(B)$ coefficients.  Writes 
    $\len(B) - 1$ and $\len(A) - 1$ coefficients to $S$ and $T$, 
    respectively.  Note that, in fact, $\len(S) \leq \len(B) - \len(G)$ 
    and $\len(T) \leq \len(A) - \len(G)$.

    No aliasing of input and output operands is permitted.

void nmod_poly_xgcd_hgcd(nmod_poly_t G, nmod_poly_t S, nmod_poly_t T,
                         const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

    Polynomials \code{S} and \code{T} are computed such that 
    \code{S*A + T*B = G}. The length of \code{S} will be at most 
    \code{lenB} and the length of \code{T} will be at most \code{lenA}.

long _nmod_poly_xgcd(mp_ptr G, mp_ptr S, mp_ptr T, 
                     mp_srcptr A, long lenA, mp_srcptr B, long lenB, 
                     nmod_t mod)

    Computes the GCD of $A$ and $B$, where $\len(A) \geq \len(B) > 0$, 
    together with cofactors $S$ and $T$ such that $S A + T B = G$. Returns 
    the length of $G$.

    No attempt is made to make the GCD monic.

    Requires that $G$ have space for $\len(B)$ coefficients.  Writes 
    $\len(B) - 1$ and $\len(A) - 1$ coefficients to $S$ and $T$, 
    respectively.  Note that, in fact, $\len(S) \leq \len(B) - \len(G)$ 
    and $\len(T) \leq \len(A) - \len(G)$.

    No aliasing of input and output operands is permitted.

void nmod_poly_xgcd(nmod_poly_t G, nmod_poly_t S, nmod_poly_t T,
                                    const nmod_poly_t A, const nmod_poly_t B)

    Computes the GCD of $A$ and $B$. The GCD of zero polynomials is
    defined to be zero, whereas the GCD of the zero polynomial and some other
    polynomial $P$ is defined to be $P$. Except in the case where
    the GCD is zero, the GCD $G$ is made monic.

    The polynomials \code{S} and \code{T} are set such that 
    \code{S*A + T*B = G}. The length of \code{S} will be at most 
    \code{lenB} and the length of \code{T} will be at most \code{lenA}.

mp_limb_t 
_nmod_poly_resultant_euclidean(mp_srcptr poly1, long len1, 
                               mp_srcptr poly2, long len2, nmod_t mod)

    Returns the resultant of \code{(poly1, len1)} and 
    \code{(poly2, len2)} using the Euclidean algorithm.

    Assumes that \code{len1 >= len2 > 0}.

    Assumes that the modulus is prime.

mp_limb_t 
nmod_poly_resultant_euclidean(const nmod_poly_t f, const nmod_poly_t g)

    Computes the resultant of $f$ and $g$ using the Euclidean algorithm.

    For two non-zero polynomials $f(x) = a_m x^m + \dotsb + a_0$ and 
    $g(x) = b_n x^n + \dotsb + b_0$ of degrees $m$ and $n$, the resultant 
    is defined to be 
    \begin{equation*}
        a_m^n b_n^m \prod_{(x, y) : f(x) = g(y) = 0} (x - y).
    \end{equation*}
    For convenience, we define the resultant to be equal to zero if either 
    of the two polynomials is zero.

mp_limb_t 
_nmod_poly_resultant(mp_srcptr poly1, long len1, 
                     mp_srcptr poly2, long len2, nmod_t mod)

    Returns the resultant of \code{(poly1, len1)} and 
    \code{(poly2, len2)}.

    Assumes that \code{len1 >= len2 > 0}.

    Assumes that the modulus is prime.

mp_limb_t 
nmod_poly_resultant(const nmod_poly_t f, const nmod_poly_t g)

    Computes the resultant of $f$ and $g$.

    For two non-zero polynomials $f(x) = a_m x^m + \dotsb + a_0$ and 
    $g(x) = b_n x^n + \dotsb + b_0$ of degrees $m$ and $n$, the resultant 
    is defined to be 
    \begin{equation*}
        a_m^n b_n^m \prod_{(x, y) : f(x) = g(y) = 0} (x - y).
    \end{equation*}
    For convenience, we define the resultant to be equal to zero if either 
    of the two polynomials is zero.

*******************************************************************************

    Power series composition

*******************************************************************************

void _nmod_poly_compose_series_horner(mp_ptr res,
        mp_srcptr poly1, long len1, mp_srcptr poly2, long len2, long n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    Assumes that \code{len1, len2, n > 0}, that \code{len1, len2 <= n},
    and that \code{(len1-1) * (len2-1) + 1 <= n}, and that \code{res} has
    space for \code{n} coefficients. Does not support aliasing between any
    of the inputs and the output.

    This implementation uses the Horner scheme.

void nmod_poly_compose_series_horner(nmod_poly_t res, 
                    const nmod_poly_t poly1, const nmod_poly_t poly2, long n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    This implementation uses the Horner scheme.

void _nmod_poly_compose_series_brent_kung(mp_ptr res, mp_srcptr poly1,
        long len1, mp_srcptr poly2, long len2, long n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    Assumes that \code{len1, len2, n > 0}, that \code{len1, len2 <= n},
    and that\\ \code{(len1-1) * (len2-1) + 1 <= n}, and that \code{res} has
    space for \code{n} coefficients. Does not support aliasing between any
    of the inputs and the output.

    This implementation uses Brent-Kung algorithm 2.1 \cite{BrentKung1978}.

void nmod_poly_compose_series_brent_kung(nmod_poly_t res, 
                const nmod_poly_t poly1, const nmod_poly_t poly2, long n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    This implementation uses Brent-Kung algorithm 2.1 \cite{BrentKung1978}.

void _nmod_poly_compose_series_divconquer(mp_ptr res, 
    mp_srcptr poly1, long len1, mp_srcptr poly2, long len2, long N, nmod_t mod)

    Composes \code{poly1} of length $\ell_1$ with \code{poly2} of 
    length $\ell_2$ modulo $x^N$ and sets \code{res} to the result, 
    i.e.\ evaluates \code{poly1} at \code{poly2}.

    Writes $\min\{(\ell_1 - 1)(\ell_2 - 1) + 1, N\}$ coefficients 
    to the vector \code{res}.

    The algorithm used is the divide and conquer algorithm. 
    It is assumed that $0 < \ell_1$ and $0 < \ell_2 \leq N$.

    Does not support aliasing between the inputs and the output.

void nmod_poly_compose_series_divconquer(nmod_poly_t res, 
    const nmod_poly_t poly1, const nmod_poly_t poly2, long N)

    Composes \code{poly1} with \code{poly2} modulo $x^N$ and sets \code{res} 
    to the result, i.e.\ evaluates \code{poly1} at \code{poly2}.

    The algorithm used is the divide and conquer algorithm.

void _nmod_poly_compose_series(mp_ptr res, mp_srcptr poly1, long len1, 
                                      mp_srcptr poly2, long len2, long n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    Assumes that \code{len1, len2, n > 0}, that \code{len1, len2 <= n},
    and that\\ \code{(len1-1) * (len2-1) + 1 <= n}, and that \code{res} has
    space for \code{n} coefficients. Does not support aliasing between any
    of the inputs and the output.

    This implementation automatically switches between the Horner scheme
    and Brent-Kung algorithm 2.1 depending on the size of the inputs.

void nmod_poly_compose_series(nmod_poly_t res, 
                    const nmod_poly_t poly1, const nmod_poly_t poly2, long n)

    Sets \code{res} to the composition of \code{poly1} and \code{poly2}
    modulo $x^n$, where the constant term of \code{poly2} is required
    to be zero.

    This implementation automatically switches between the Horner scheme
    and Brent-Kung algorithm 2.1 depending on the size of the inputs.

*******************************************************************************

    Power series reversion

*******************************************************************************

void _nmod_poly_revert_series_lagrange(mp_ptr Qinv, mp_srcptr Q,
        long n, nmod_t mod)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$. The arguments must
    both have length \code{n} and may not be aliased.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses the Lagrange inversion formula.

void nmod_poly_revert_series_lagrange(nmod_poly_t Qinv,
            const nmod_poly_t Q, long n)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses the Lagrange inversion formula.

void _nmod_poly_revert_series_lagrange_fast(mp_ptr Qinv, mp_srcptr Q,
        long n, nmod_t mod)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$. The arguments must
    both have length \code{n} and may not be aliased.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses a reduced-complexity implementation
    of the Lagrange inversion formula.

void nmod_poly_revert_series_lagrange_fast(nmod_poly_t Qinv,
            const nmod_poly_t Q, long n)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses a reduced-complexity implementation
    of the Lagrange inversion formula.

void _nmod_poly_revert_series_newton(mp_ptr Qinv, mp_srcptr Q,
    long n, nmod_t mod)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$. The arguments must
    both have length \code{n} and may not be aliased.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses Newton iteration \cite{BrentKung1978}.

void nmod_poly_revert_series_newton(nmod_poly_t Qinv,
        const nmod_poly_t Q, long n)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation uses Newton iteration \cite{BrentKung1978}.

void _nmod_poly_revert_series(mp_ptr Qinv, mp_srcptr Q, long n, nmod_t mod)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$. The arguments must
    both have length \code{n} and may not be aliased.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation automatically chooses between the Lagrange
    inversion formula and Newton iteration based on the size of the
    input.

void nmod_poly_revert_series(nmod_poly_t Qinv, const nmod_poly_t Q, long n)

    Sets \code{Qinv} to the compositional inverse or reversion of \code{Q}
    as a power series, i.e. computes $Q^{-1}$ such that
    $Q(Q^{-1}(x)) = Q^{-1}(Q(x)) = x \bmod x^n$.

    It is required that $Q_0 = 0$ and that $Q_1$ as well as the integers
    $1, 2, \ldots, n-1$ are invertible modulo the modulus.

    This implementation automatically chooses between the Lagrange
    inversion formula and Newton iteration based on the size of the
    input.

*******************************************************************************

    Square roots

    The series expansions for $\sqrt{h}$ and $1/\sqrt{h}$ are defined
    by means of the generalised binomial theorem
    $$h^r = (1+y)^r =
        \sum_{k=0}^{\infty} {r \choose k} y^k.$$
    It is assumed that $h$ has constant term $1$ and that the coefficients
    $2^{-k}$ exist in the coefficient ring (i.e. $2$ must be invertible).


*******************************************************************************

void _nmod_poly_invsqrt_series(mp_ptr g, mp_srcptr h, long n, nmod_t mod)

    Set the first $n$ terms of $g$ to the series expansion of $1/\sqrt{h}$.
    It is assumed that $n > 0$, that $h$ has constant term 1 and that $h$
    is zero-padded as necessary to length $n$. Aliasing is not permitted.

void nmod_poly_invsqrt_series(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g$ to the series expansion of $1/\sqrt{h}$ to order $O(x^n)$.
    It is assumed that $h$ has constant term 1.

void _nmod_poly_sqrt_series(mp_ptr g, mp_srcptr h, long n, nmod_t mod)

    Set the first $n$ terms of $g$ to the series expansion of $\sqrt{h}$.
    It is assumed that $n > 0$, that $h$ has constant term 1 and that $h$
    is zero-padded as necessary to length $n$. Aliasing is not permitted.

void nmod_poly_sqrt_series(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g$ to the series expansion of $\sqrt{h}$ to order $O(x^n)$.
    It is assumed that $h$ has constant term 1.

int _nmod_poly_sqrt(mp_ptr s, mp_srcptr p, long n, nmod_t mod)

    If \code{(p, n)} is a perfect square, sets \code{(s, n / 2 + 1)}
    to a square root of $p$ and returns 1. Otherwise returns 0.

int nmod_poly_sqrt(nmod_poly_t s, const nmod_poly_t p, long n)

    If $p$ is a perfect square, sets $s$ to a square root of $p$
    and returns 1. Otherwise returns 0.

*******************************************************************************

    Transcendental functions

    The elementary transcendental functions of a formal power series $h$
    are defined as

    $$\exp(h(x)) = \sum_{k=0}^{\infty} \frac{(h(x))^k}{k!}$$

    $$\log(h(x)) = \int_0^x \frac{h'(t)}{h(t)} dt$$

    $$\operatorname{atan}(h(x)) = \int_0^x\frac{h'(t)}{1+(h(t))^2} dt$$

    $$\operatorname{atanh}(h(x)) = \int_0^x\frac{h'(t)}{1-(h(t))^2} dt$$

    $$\operatorname{asin}(h(x)) = \int_0^x\frac{h'(t)}{\sqrt{1-(h(t))^2}} dt$$

    $$\operatorname{asinh}(h(x)) = \int_0^x\frac{h'(t)}{\sqrt{1+(h(t))^2}} dt$$

    The functions sin, cos, tan, etc. are defined using standard inverse
    or functional relations.

    The logarithm function assumes that $h$ has constant term $1$. All
    other functions assume that $h$ has constant term $0$.

    All functions assume that the coefficient $1/k$ or $1/k!$ exists
    for all indices $k$. When computing to order $O(x^n)$, the modulus $p$
    must therefore be a prime satisfying $p \ge n$. Further, we always
    require that $p > 2$ in order to be able to multiply by $1/2$ for
    internal purposes.

    If the input does not satisfy all these conditions, results are undefined.

    Except where otherwise noted, functions are implemented with optimal
    (up to constants) complexity $O(M(n))$, where $M(n)$ is the cost
    of polynomial multiplication.

*******************************************************************************

void _nmod_poly_log_series_monomial_ui(mp_ptr g,
            mp_limb_t c, ulong r, long n, nmod_t mod)

    Set $g = \log(1+cx^r) + O(x^n)$. Assumes $n > 0$, $r > 0$, and that
    the coefficient is reduced by the modulus. Works efficiently in linear
    time.

void nmod_poly_log_series_monomial_ui(nmod_poly_t g,
            mp_limb_t c, ulong r, long n)

    Set $g = \log(1+cx^r) + O(x^n)$. Works efficiently in linear time.

void _nmod_poly_log_series(mp_ptr g, mp_srcptr h, long n, nmod_t mod)

    Set $g = \log(h) + O(x^n)$. Assumes $n > 0$ and that $h$ is zero-padded
    as necessary to length $n$. Aliasing of $g$ and $h$ is allowed.

void nmod_poly_log_series(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g = \log(h) + O(x^n)$. The case $h = 1+cx^r$ is automatically
    detected and handled efficiently.

void _nmod_poly_exp_series_monomial_ui(mp_ptr g,
            mp_limb_t c, ulong r, long n, nmod_t mod)

    Set $g = \exp(cx^r) + O(x^n)$. Assumes $n > 0$, $r > 0$, and that
    the coefficient is reduced by the modulus. Works efficiently
    in linear time.

void nmod_poly_exp_series_monomial_ui(nmod_poly_t g,
            mp_limb_t c, ulong r, long n)

    Set $g = \exp(cx^r) + O(x^n)$. Works efficiently in linear time.

void _nmod_poly_exp_series_basecase(mp_ptr g, mp_srcptr h, long hlen,
                                        long n, nmod_t mod)

    Set $g = \exp(h) + O(x^n)$ using a simple $O(n^2)$ algorithm.
    Assumes $n > 0$ and $\operatorname{hlen} > 0$. Only the first
    $\operatorname{hlen}$ coefficients of $h$ will be read.
    Aliasing of $g$ and $h$ is allowed.

void nmod_poly_exp_series_basecase(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g = \exp(h) + O(x^n)$ using a simple $O(n^2)$ algorithm.

void _nmod_poly_exp_series(mp_ptr g, mp_srcptr h, long n, nmod_t mod)

    Set $g = \exp(h) + O(x^n)$. Assumes $n > 0$ and that $h$ is zero-padded
    as necessary to length $n$. Aliasing of $g$ and $h$ is not allowed.

    Uses Newton iteration (the version given in \cite{HanZim2004}).
    For small $n$, falls back to the basecase algorithm.

void  _nmod_poly_exp_expinv_series(mp_ptr f, mp_ptr g, mp_srcptr h,
        long n, nmod_t mod)

    Set $f = \exp(h) + O(x^n)$ and $g = \exp(-h) + O(x^n)$, more efficiently
    for large $n$ than performing a separate inversion to obtain $g$.
    Assumes $n > 0$ and that $h$ is zero-padded
    as necessary to length $n$. Aliasing is not allowed.

    Uses Newton iteration (the version given in \cite{HanZim2004}).
    For small $n$, falls back to the basecase algorithm.

void nmod_poly_exp_series(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g = \exp(h) + O(x^n)$. The case $h = cx^r$ is automatically
    detected and handled efficiently. Otherwise this function automatically
    uses the basecase algorithm for small $n$ and Newton iteration otherwise.

void _nmod_poly_atan_series(mp_ptr g, mp_srcptr h, long n, nmod_t mod)

    Set $g = \operatorname{atan}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed.

void nmod_poly_atan_series(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g = \operatorname{atan}(h) + O(x^n)$.

void _nmod_poly_atanh_series(mp_ptr g, mp_srcptr h, long n, nmod_t mod)

    Set $g = \operatorname{atanh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed.

void nmod_poly_atanh_series(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g = \operatorname{atanh}(h) + O(x^n)$.

void _nmod_poly_asin_series(mp_ptr g, mp_srcptr h, long n, nmod_t mod)

    Set $g = \operatorname{asin}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed.

void nmod_poly_asin_series(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g = \operatorname{asin}(h) + O(x^n)$.

void _nmod_poly_asinh_series(mp_ptr g, mp_srcptr h, long n, nmod_t mod)

    Set $g = \operatorname{asinh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed.

void nmod_poly_asinh_series(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g = \operatorname{asinh}(h) + O(x^n)$.

void _nmod_poly_sin_series(mp_ptr g, mp_srcptr h, long n, nmod_t mod)

    Set $g = \operatorname{sin}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed. The value is computed using the identity
    $\sin(x) = 2 \tan(x/2) / (1 + \tan^2(x/2)).$

void nmod_poly_sin_series(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g = \operatorname{sin}(h) + O(x^n)$.

void _nmod_poly_cos_series(mp_ptr g, mp_srcptr h, long n, nmod_t mod)

    Set $g = \operatorname{cos}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    allowed. The value is computed using the identity
    $\cos(x) = (1-\tan^2(x/2)) / (1 + \tan^2(x/2)).$

void nmod_poly_cos_series(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g = \operatorname{cos}(h) + O(x^n)$.

void _nmod_poly_tan_series(mp_ptr g, mp_srcptr h, long n, nmod_t mod)

    Set $g = \operatorname{tan}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    not allowed. Uses Newton iteration to invert the atan function.

void nmod_poly_tan_series(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g = \operatorname{tan}(h) + O(x^n)$.

void _nmod_poly_sinh_series(mp_ptr g, mp_srcptr h, long n, nmod_t mod)

    Set $g = \operatorname{sinh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    not allowed. Uses the identity $\sinh(x) = (e^x - e^{-x})/2$.

void nmod_poly_sinh_series(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g = \operatorname{sinh}(h) + O(x^n)$.

void _nmod_poly_cosh_series(mp_ptr g, mp_srcptr h, long n, nmod_t mod)

    Set $g = \operatorname{cosh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Aliasing of $g$ and $h$ is
    not allowed. Uses the identity $\cosh(x) = (e^x + e^{-x})/2$.

void nmod_poly_cosh_series(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g = \operatorname{cosh}(h) + O(x^n)$.

void _nmod_poly_tanh_series(mp_ptr g, mp_srcptr h, long n, nmod_t mod)

    Set $g = \operatorname{tanh}(h) + O(x^n)$. Assumes $n > 0$ and that $h$
    is zero-padded as necessary to length $n$. Uses the identity
    $\tanh(x) = (e^{2x}-1)/(e^{2x}+1)$.

void nmod_poly_tanh_series(nmod_poly_t g, const nmod_poly_t h, long n)

    Set $g = \operatorname{tanh}(h) + O(x^n)$.

*******************************************************************************

    Products

*******************************************************************************

void _nmod_poly_product_roots_nmod_vec(mp_ptr poly, mp_srcptr xs,
    long n, nmod_t mod)

    Sets \code{(poly, n + 1)} to the monic polynomial which is the product
    of $(x - x_0)(x - x_1) \cdots (x - x_{n-1})$, the roots $x_i$ being
    given by \code{xs}.

    Aliasing of the input and output is not allowed.

void nmod_poly_product_roots_nmod_vec(nmod_poly_t poly, mp_srcptr xs, long n)

    Sets \code{poly} to the monic polynomial which is the product
    of $(x - x_0)(x - x_1) \cdots (x - x_{n-1})$, the roots $x_i$ being
    given by \code{xs}.

*******************************************************************************

    Subproduct trees

*******************************************************************************

mp_ptr * _nmod_poly_tree_alloc(long len)

    Allocates space for a subproduct tree of the given length, having
    linear factors at the lowest level.

    Entry $i$ in the tree is a pointer to a single array of limbs,
    capable of storing $\lfloor n / 2^i \rfloor$ subproducts of
    degree $2^i$ adjacently, plus a trailing entry if $n / 2^i$ is
    not an integer. Here $n$ denotes \code{len}.

    For example, a tree of length 7 built from monic linear factors has
    the following structure, where spaces have been inserted
    for illustrative purposes:

    \begin{verbatim}
       X1 X1 X1 X1 X1 X1 X1
       XX1   XX1   XX1   X1
       XXXX1       XX1   X1
       XXXXXXX1
    \end{verbatim}

void _nmod_poly_tree_free(mp_ptr * tree, long len)

    Frees the space allocated for the subproduct tree.

void _nmod_poly_tree_build(mp_ptr * tree, mp_srcptr roots, long len,
    nmod_t mod)

    Builds a subproduct tree in the preallocated space from
    the \code{len} monic linear factors $(x-r_i)$. The top level
    product is not computed.


*******************************************************************************

    Inflation and deflation

*******************************************************************************

void nmod_poly_inflate(nmod_poly_t result, const nmod_poly_t input,
    ulong inflation)

    Sets \code{result} to the inflated polynomial $p(x^n)$ where
    $p$ is given by \code{input} and $n$ is given by \code{inflation}.

void nmod_poly_deflate(nmod_poly_t result, const nmod_poly_t input,
    ulong deflation)

    Sets \code{result} to the deflated polynomial $p(x^{1/n})$ where
    $p$ is given by \code{input} and $n$ is given by \code{deflation}.
    Requires $n > 0$.

ulong nmod_poly_deflation(const nmod_poly_t input)

    Returns the largest integer by which \code{input} can be deflated.
    As special cases, returns 0 if \code{input} is the zero polynomial
    and 1 if \code{input} is a constant polynomial.

*******************************************************************************

    Factorisation

*******************************************************************************

void nmod_poly_factor_init(nmod_poly_factor_t fac)

    Initialises \code{fac} for use. An \code{nmod_poly_factor_t}
    represents a polynomial in factorised form as a product of
    polynomials with associated exponents.

void nmod_poly_factor_clear(nmod_poly_factor_t fac)

    Frees all memory associated with \code{fac}.

void nmod_poly_factor_realloc(nmod_poly_factor_t fac, long alloc)

    Reallocates the factor structure to provide space for 
    precisely \code{alloc} factors.

void nmod_poly_factor_fit_length(nmod_poly_factor_t fac, long len)

    Ensures that the factor structure has space for at 
    least \code{len} factors.  This function takes care 
    of the case of repeated calls by always at least 
    doubling the number of factors the structure can hold.

void nmod_poly_factor_set(nmod_poly_factor_t res, const nmod_poly_factor_t fac)

    Sets \code{res} to the same factorisation as \code{fac}.

void nmod_poly_factor_print(const nmod_poly_factor_t fac)

    Prints the entries of \code{fac} to standard output.

void nmod_poly_factor_insert(nmod_poly_factor_t fac,
                             const nmod_poly_t poly, long exp)

    Inserts the factor \code{poly} with multiplicity \code{exp} into
    the factorisation \code{fac}.

    If \code{fac} already contains \code{poly}, then \code{exp} simply
    gets added to the exponent of the existing entry.

void nmod_poly_factor_concat(nmod_poly_factor_t res,
                             const nmod_poly_factor_t fac)

    Concatenates two factorisations.

    This is equivalent to calling \code{nmod_poly_factor_insert()} 
    repeatedly with the individual factors of \code{fac}.

    Does not support aliasing between \code{res} and \code{fac}.

void nmod_poly_factor_pow(nmod_poly_factor_t fac, long exp)

    Raises \code{fac} to the power \code{exp}.

ulong nmod_poly_remove(nmod_poly_t f, const nmod_poly_t p)

    Removes the highest possible power of \code{p} from \code{f} and
    returns the exponent.

int nmod_poly_is_irreducible(const nmod_poly_t f)

    Returns 1 if the polynomial \code{f} is irreducible, otherwise returns 0.

int _nmod_poly_is_squarefree(mp_srcptr f, long len, nmod_t mod)

    Returns 1 if \code{(f, len)} is squarefree, and 0 otherwise. As a
    special case, the zero polynomial is not considered squarefree.
    There are no restrictions on the length.

int nmod_poly_is_squarefree(nmod_poly_t f)

    Returns 1 if \code{f} is squarefree, and 0 otherwise. As a special
    case, the zero polynomial is not considered squarefree.

void nmod_poly_factor_squarefree(nmod_poly_factor_t res, const nmod_poly_t f)

    Sets \code{res} to a square-free factorization of \code{f}.

int nmod_poly_factor_equal_deg_prob(nmod_poly_t factor,
    flint_rand_t state, const nmod_poly_t pol, long d)

    Probabilistic equal degree factorisation of \code{pol} into
    irreducible factors of degree \code{d}. If it passes, a factor is
    placed in factor and 1 is returned, otherwise 0 is returned and
    the value of factor is undetermined.

    Requires that \code{pol} be monic, non-constant and squarefree.

void nmod_poly_factor_equal_deg(nmod_poly_factor_t factors,
                                const nmod_poly_t pol, long d)

    Assuming \code{pol} is a product of irreducible factors all of
    degree \code{d}, finds all those factors and places them in factors.
    Requires that \code{pol} be monic, non-constant and squarefree.

void nmod_poly_factor_cantor_zassenhaus(nmod_poly_factor_t res,
                                        const nmod_poly_t f)

    Factorises a non-constant polynomial \code{f} into monic irreducible
    factors using the Cantor-Zassenhaus algorithm.

void nmod_poly_factor_berlekamp(nmod_poly_factor_t res, const nmod_poly_t f)

    Factorises a non-constant, squarefree polynomial \code{f} into monic
    irreducible factors using the Berlekamp algorithm.

mp_limb_t nmod_poly_factor_with_berlekamp(nmod_poly_factor_t res,
                                          const nmod_poly_t f)

    Factorises a general polynomial \code{f} into monic irreducible factors
    and returns the leading coefficient of \code{f}, or 0 if \code{f}
    is the zero polynomial.

    This function first checks for small special cases, deflates \code{f}
    if it is of the form $p(x^m)$ for some $m > 1$, then performs a
    square-free factorisation, and finally runs Berlekamp on all the
    individual square-free factors.

mp_limb_t nmod_poly_factor_with_cantor_zassenhaus(nmod_poly_factor_t res,
                                                  const nmod_poly_t f)

    Factorises a general polynomial \code{f} into monic irreducible factors
    and returns the leading coefficient of \code{f}, or 0 if \code{f}
    is the zero polynomial.

    This function first checks for small special cases, deflates \code{f}
    if it is of the form $p(x^m)$ for some $m > 1$, then performs a
    square-free factorisation, and finally runs Cantor-Zassenhaus on all the
    individual square-free factors.

mp_limb_t nmod_poly_factor(nmod_poly_factor_t res, const nmod_poly_t f)

    Factorises a general polynomial \code{f} into monic irreducible factors
    and returns the leading coefficient of \code{f}, or 0 if \code{f}
    is the zero polynomial.

    This function first checks for small special cases, deflates \code{f}
    if it is of the form $p(x^m)$ for some $m > 1$, then performs a
    square-free factorisation, and finally runs either Cantor-Zassenhaus
    or Berlekamp on all the individual square-free factors.
    Currently Cantor-Zassenhaus is used by default unless the modulus is 2, in
    which case Berlekamp is used.

