use crate::cpu_features::CpuFeatureLevel;
use crate::util::*;
use super::TxType;
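// Pick the forward-transform implementation at build time: hand-written
// x86_64 (NASM) or aarch64 (NEON) assembly when available, otherwise the
// portable Rust fallback defined below.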
cfg_if::cfg_if! {
  if #[cfg(nasm_x86_64)] {
    pub use crate::asm::x86::transform::forward::*;
  } else if #[cfg(asm_neon)] {
    pub use crate::asm::aarch64::transform::forward::*;
  } else {
    pub use self::rust::*;
  }
}
pub mod rust {
  use super::*;
  use std::mem::MaybeUninit;

  use crate::transform::forward_shared::*;
  use crate::transform::{av1_round_shift_array, valid_av1_transform, TxSize};
  use simd_helpers::cold_for_target_arch;

  // A single 1-D transform stage; it operates in place on one column or row
  // of intermediate coefficients.
  type TxfmFunc = fn(&mut [i32]);
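  // `impl_1d_tx!` (from `forward_shared`) expands to the shared 1-D transform
  // kernels and the `get_func` lookup used by `forward_transform` below.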
  impl_1d_tx!();
  impl TxOperations for i32 {
    fn zero() -> Self {
      0
    }

    // Multiply by a transform constant, rounding to nearest: add half of the
    // divisor (1 << (SHIFT - 1)) before the arithmetic shift.
    fn tx_mul<const SHIFT: i32>(self, mul: i32) -> Self {
      ((self * mul) + (1 << SHIFT >> 1)) >> SHIFT
    }

    // Halve, rounding toward zero: bias negative values by one so that e.g.
    // -3 maps to -1 rather than -2.
    fn rshift1(self) -> Self {
      (self + i32::from(self < 0)) >> 1
    }

    fn add(self, b: Self) -> Self {
      self + b
    }

    fn sub(self, b: Self) -> Self {
      self - b
    }

    // Butterfly averages: the sum/difference halved with a truncating (floor)
    // arithmetic shift.
    fn add_avg(self, b: Self) -> Self {
      (self + b) >> 1
    }

    fn sub_avg(self, b: Self) -> Self {
      (self - b) >> 1
    }
  }
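  /// Forward-transform a `tx_size` block of residuals.
  ///
  /// `input` holds the residuals with the given row `stride`; the resulting
  /// coefficients are written to `output` transposed (column-major) in at
  /// most 32x32 chunks, with the first 32x32 chunk stored first. `bd` is the
  /// bit depth. `_cpu` is unused here: this is the portable fallback, so
  /// there is nothing to dispatch on.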
  #[cold_for_target_arch("x86_64")]
  pub fn forward_transform<T: Coefficient>(
    input: &[i16], output: &mut [MaybeUninit<T>], stride: usize,
    tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel,
  ) {
    assert!(valid_av1_transform(tx_size, tx_type));

    let txfm_size_col = tx_size.width();
    let txfm_size_row = tx_size.height();

    // Scratch buffer sized for the largest (64x64) transform; only the
    // prefix needed for this block is used.
    let mut buf = Aligned::<[MaybeUninit<i32>; 64 * 64]>::uninit_array();
    let buf = &mut buf.data[..txfm_size_col * txfm_size_row];

    // The config carries the 1-D transform types, the per-stage shifts, and
    // whether the input must be flipped vertically or horizontally.
    let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd);

    let txfm_func_col = get_func(cfg.txfm_type_col);
    let txfm_func_row = get_func(cfg.txfm_type_row);
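
    // Column pass: transform each input column into `buf`.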
    for c in 0..txfm_size_col {
      let mut col_coeffs = Aligned::<[MaybeUninit<i32>; 64]>::uninit_array();
      let col_coeffs = &mut col_coeffs.data[..txfm_size_row];
      if cfg.ud_flip {
        // Flip the column upside down while gathering it.
        for r in 0..txfm_size_row {
          col_coeffs[r]
            .write((input[(txfm_size_row - r - 1) * stride + c]).into());
        }
      } else {
        for r in 0..txfm_size_row {
          col_coeffs[r].write((input[r * stride + c]).into());
        }
      }
      // SAFETY: every element of `col_coeffs` was written above.
      let col_coeffs = unsafe { slice_assume_init_mut(col_coeffs) };

      av1_round_shift_array(col_coeffs, txfm_size_row, -cfg.shift[0]);
      txfm_func_col(col_coeffs);
      av1_round_shift_array(col_coeffs, txfm_size_row, -cfg.shift[1]);
      if cfg.lr_flip {
        for r in 0..txfm_size_row {
          // Flip from left to right while scattering into `buf`.
          buf[r * txfm_size_col + (txfm_size_col - c - 1)]
            .write(col_coeffs[r]);
        }
      } else {
        for r in 0..txfm_size_row {
          buf[r * txfm_size_col + c].write(col_coeffs[r]);
        }
      }
    }
    // SAFETY: the column pass above initialized every element of `buf`.
    let buf = unsafe { slice_assume_init_mut(buf) };
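
    // Row pass: transform each row of `buf` in place, then write it out.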
    for (r, row_coeffs) in buf.chunks_exact_mut(txfm_size_col).enumerate() {
      txfm_func_row(row_coeffs);
      av1_round_shift_array(row_coeffs, txfm_size_col, -cfg.shift[2]);

      // Store the output in at most 32x32 chunks so that the first 32x32
      // coefficients always come first; within a chunk the coefficients are
      // written transposed (column-major).
      let output_stride = txfm_size_row.min(32);

      // Select the first or last 32 rows of output.
      let output = &mut output
        [(r >= 32) as usize * output_stride * txfm_size_col.min(32)..];
      for cg in (0..txfm_size_col).step_by(32) {
        // Select the 32-column group this coefficient belongs to.
        let output = &mut output[txfm_size_row * cg..];
        for c in 0..txfm_size_col.min(32) {
          output[c * output_stride + (r & 31)]
            .write(T::cast_from(row_coeffs[c + cg]));
        }
      }
    }
  }
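
  // Minimal usage sketch (illustrative only; the values and the caller-side
  // setup are assumptions, not part of this module): forward-transform a
  // flat 4x4 residual block at 8-bit depth with the portable implementation.
  //
  //   let residual = [1i16; 16];
  //   let mut coeffs = [MaybeUninit::<i32>::uninit(); 16];
  //   forward_transform(
  //     &residual, &mut coeffs, 4, TxSize::TX_4X4, TxType::DCT_DCT, 8,
  //     CpuFeatureLevel::default(),
  //   );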
}