diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
index 205f6c1a24e..5dbc1ff91f3 100644
--- a/.github/workflows/benchmarks.yml
+++ b/.github/workflows/benchmarks.yml
@@ -75,5 +75,9 @@ jobs:
           mode: instrumentation
           run: |
             echo "Running benchmarks for ${{ matrix.benchmark-target.package }}"
-            cargo codspeed run -p ${{ matrix.benchmark-target.package }} > /dev/null
+            if [[ "${{ matrix.benchmark-target.package }}" == "uu_factor" ]]; then
+              cargo codspeed run -p ${{ matrix.benchmark-target.package }}
+            else
+              cargo codspeed run -p ${{ matrix.benchmark-target.package }} > /dev/null
+            fi
           token: ${{ secrets.CODSPEED_TOKEN }}
diff --git a/.vscode/cspell.dictionaries/jargon.wordlist.txt b/.vscode/cspell.dictionaries/jargon.wordlist.txt
index bd29bd2461d..2c08907167e 100644
--- a/.vscode/cspell.dictionaries/jargon.wordlist.txt
+++ b/.vscode/cspell.dictionaries/jargon.wordlist.txt
@@ -164,6 +164,8 @@ wordlists
 xattrs
 xpass
 
+jemalloc
+jemallocator
 # * abbreviations
 AMPM
 ampm
diff --git a/Cargo.lock b/Cargo.lock
index 5781d4e32fd..5a81bcf38da 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1563,6 +1563,37 @@ version = "1.0.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
 
+[[package]]
+name = "jemalloc-ctl"
+version = "0.5.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7cffc705424a344c054e135d12ee591402f4539245e8bbd64e6c9eaa9458b63c"
+dependencies = [
+ "jemalloc-sys",
+ "libc",
+ "paste",
+]
+
+[[package]]
+name = "jemalloc-sys"
+version = "0.5.4+5.3.0-patched"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac6c1946e1cea1788cbfde01c993b52a10e2da07f4bac608228d1bed20bfebf2"
+dependencies = [
+ "cc",
+ "libc",
+]
+
+[[package]]
+name = "jemallocator"
+version = "0.5.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a0de374a9f8e63150e6f5e8a60cc14c668226d7a347d8aee1a45766e3c4dd3bc"
+dependencies = [
+ "jemalloc-sys",
+ "libc",
+]
+
 [[package]]
 name = "jiff"
 version = "0.2.16"
@@ -2040,6 +2071,12 @@ dependencies = [
  "winnow",
 ]
 
+[[package]]
+name = "paste"
+version = "1.0.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
+
 [[package]]
 name = "phf"
 version = "0.13.1"
@@ -3317,6 +3354,8 @@ dependencies = [
  "clap",
  "codspeed-divan-compat",
  "fluent",
+ "jemalloc-ctl",
+ "jemallocator",
  "num-bigint",
  "num-prime",
  "num-traits",
diff --git a/Cargo.toml b/Cargo.toml
index b388373a2aa..7a50cb21a8d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -331,6 +331,8 @@ icu_locale = "2.0.0"
 icu_provider = "2.0.0"
 indicatif = "0.18.0"
 itertools = "0.14.0"
+jemalloc-ctl = "0.5"
+jemallocator = "0.5"
 jiff = { version = "0.2.10", default-features = false, features = [
   "std",
   "alloc",
@@ -347,6 +349,7 @@ nix = { version = "0.30", default-features = false }
 nom = "8.0.0"
 notify = { version = "=8.2.0", features = ["macos_kqueue"] }
 num-bigint = "0.4.4"
+num-integer = "0.1"
 num-prime = "0.4.4"
 num-traits = "0.2.19"
 onig = { version = "~6.5.1", default-features = false }
diff --git a/src/uu/factor/Cargo.toml b/src/uu/factor/Cargo.toml
index ef672bf9308..09b659e59cd 100644
--- a/src/uu/factor/Cargo.toml
+++ b/src/uu/factor/Cargo.toml
@@ -34,6 +34,10 @@ divan = { workspace = true }
 rand = { workspace = true }
 uucore = { workspace = true, features = ["benchmark"] }
 
+[target.'cfg(any(target_os = "linux", target_os = "macos", target_os = "freebsd"))'.dev-dependencies]
+jemalloc-ctl = { workspace = true }
+jemallocator = { workspace = true }
+
 [lib]
 path = "src/factor.rs"
 
diff --git a/src/uu/factor/benches/factor_bench.rs b/src/uu/factor/benches/factor_bench.rs
index 952ea09a616..93149eab694 100644
--- a/src/uu/factor/benches/factor_bench.rs
+++ b/src/uu/factor/benches/factor_bench.rs
@@ -9,6 +9,30 @@ use divan::{Bencher, black_box};
 use uu_factor::uumain;
 use uucore::benchmark::run_util_function;
 
+#[cfg(any(target_os = "linux", target_os = "macos", target_os = "freebsd"))]
+use jemallocator::Jemalloc;
+
+#[cfg(any(target_os = "linux", target_os = "macos", target_os = "freebsd"))]
+#[global_allocator]
+static ALLOC: Jemalloc = Jemalloc;
+
+#[cfg(any(target_os = "linux", target_os = "macos", target_os = "freebsd"))]
+fn log_jemalloc_stats(label: &str) {
+    use jemalloc_ctl::{epoch, stats};
+
+    epoch::advance().unwrap();
+    let allocated = stats::allocated::read().unwrap();
+    let resident = stats::resident::read().unwrap();
+
+    println!(
+        "jemalloc {label}: allocated={} bytes, resident={} bytes",
+        allocated, resident
+    );
+}
+
+#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "freebsd")))]
+fn log_jemalloc_stats(_label: &str) {}
+
 /// Benchmark multiple u64 digits
 #[divan::bench(args = [(2)])]
 fn factor_multiple_u64s(bencher: Bencher, start_num: u64) {
@@ -16,9 +40,11 @@ fn factor_multiple_u64s(bencher: Bencher, start_num: u64) {
         // this is a range of 5000 different u128 integers
         .with_inputs(|| (start_num, start_num + 2500))
         .bench_values(|(start_u64, end_u64)| {
+            log_jemalloc_stats("before factor_multiple_u64s");
             for u64_digit in start_u64..=end_u64 {
                 black_box(run_util_function(uumain, &[&u64_digit.to_string()]));
             }
+            log_jemalloc_stats("after factor_multiple_u64s");
         });
 }
 
diff --git a/src/uu/factor/src/factor.rs b/src/uu/factor/src/factor.rs
index 15af962d659..2fff44b99bb 100644
--- a/src/uu/factor/src/factor.rs
+++ b/src/uu/factor/src/factor.rs
@@ -3,7 +3,14 @@
 // For the full copyright and license information, please view the LICENSE
 // file that was distributed with this source code.
 
-// spell-checker:ignore funcs
+// spell-checker:ignore funcs biguint modpow unfactored
+// NOTE:
+//   For BigUint > u128, this implementation attempts factorization using Miller-Rabin,
+//   an improved Pollard-rho, and p-1.
+//   However, compared to GNU factor, there may still be differences in performance
+//   and success rate.
+//   To further approach GNU factor behavior, additional algorithms (e.g. ECM)
+//   and parameter tuning may be required.
 
 use std::collections::BTreeMap;
 use std::io::BufRead;
@@ -11,9 +18,9 @@ use std::io::{self, Write, stdin, stdout};
 
 use clap::{Arg, ArgAction, Command};
 use num_bigint::BigUint;
-use num_traits::FromPrimitive;
+use num_traits::{FromPrimitive, One, ToPrimitive, Zero};
 use uucore::display::Quotable;
-use uucore::error::{FromIo, UResult, USimpleError, set_exit_code};
+use uucore::error::{FromIo, UResult, set_exit_code};
 use uucore::translate;
 use uucore::{format_usage, show_error, show_warning};
 
@@ -28,52 +35,55 @@ fn print_factors_str(
     w: &mut io::BufWriter<impl Write>,
     print_exponents: bool,
 ) -> UResult<()> {
-    let rx = num_str.trim().parse::<BigUint>();
-    let Ok(x) = rx else {
-        // return Ok(). it's non-fatal and we should try the next number.
-        show_warning!("{}: {}", num_str.maybe_quote(), rx.unwrap_err());
+    let s = num_str.trim();
+
+    // First, interpret as BigUint.
+    let rx_big = s.parse::<BigUint>();
+    let Ok(x_big) = rx_big else {
+        // Non-fatal error. Proceed to the next input number.
+        show_warning!("{}: {}", num_str.maybe_quote(), rx_big.unwrap_err());
         set_exit_code(1);
         return Ok(());
     };
 
-    if x > BigUint::from_u32(1).unwrap() {
-        // use num_prime's factorize64 algorithm for u64 integers
-        if x <= BigUint::from_u64(u64::MAX).unwrap() {
-            let prime_factors = num_prime::nt_funcs::factorize64(x.clone().to_u64_digits()[0]);
-            write_result_u64(w, &x, prime_factors, print_exponents)
-                .map_err_context(|| translate!("factor-error-write-error"))?;
-        }
-        // use num_prime's factorize128 algorithm for u128 integers
-        else if x <= BigUint::from_u128(u128::MAX).unwrap() {
-            let rx = num_str.trim().parse::<u128>();
-            let Ok(x) = rx else {
-                // return Ok(). it's non-fatal and we should try the next number.
-                show_warning!("{}: {}", num_str.maybe_quote(), rx.unwrap_err());
-                set_exit_code(1);
-                return Ok(());
-            };
-            let prime_factors = num_prime::nt_funcs::factorize128(x);
-            write_result_u128(w, &x, prime_factors, print_exponents)
-                .map_err_context(|| translate!("factor-error-write-error"))?;
-        }
-        // use num_prime's fallible factorization for anything greater than u128::MAX
-        else {
-            let (prime_factors, remaining) = num_prime::nt_funcs::factors(x.clone(), None);
-            if let Some(_remaining) = remaining {
-                return Err(USimpleError::new(
-                    1,
-                    translate!("factor-error-factorization-incomplete"),
-                ));
-            }
-            write_result_big_uint(w, &x, prime_factors, print_exponents)
-                .map_err_context(|| translate!("factor-error-write-error"))?;
-        }
-    } else {
+    if x_big <= BigUint::from_u32(1).unwrap() {
+        // For values <= 1: as in GNU factor, print the input with no prime factors.
         let empty_primes: BTreeMap<BigUint, usize> = BTreeMap::new();
-        write_result_big_uint(w, &x, empty_primes, print_exponents)
+        write_result_big_uint(w, &x_big, empty_primes, print_exponents)
+            .map_err_context(|| translate!("factor-error-write-error"))?;
+        return Ok(());
+    }
+
+    // Try parsing directly into u64 / u128 and delegate to num_prime if successful.
+    // This avoids unnecessary BigUint conversions and speeds up the common cases.
+    if let Ok(v) = s.parse::<u64>() {
+        let prime_factors = num_prime::nt_funcs::factorize64(v);
+        write_result_u64(w, &x_big, prime_factors, print_exponents)
             .map_err_context(|| translate!("factor-error-write-error"))?;
+        return Ok(());
+    }
+
+    if let Ok(v) = s.parse::<u128>() {
+        let prime_factors = num_prime::nt_funcs::factorize128(v);
+        write_result_u128(w, &v, prime_factors, print_exponents)
+            .map_err_context(|| translate!("factor-error-write-error"))?;
+        return Ok(());
+    }
+
+    // For BigUint > u128: use our own recursive factorization based on
+    // Miller-Rabin + Pollard-rho + p-1.
+    let mut factors: Vec<BigUint> = Vec::new();
+    let success = factor_biguint_recursive(&x_big, &mut factors);
+
+    if !success {
+        // Only set exit code=1 when complete factorization could not be achieved
+        set_exit_code(1);
     }
 
+    let factorization = collect_biguint_factors(&factors);
+    write_result_big_uint(w, &x_big, factorization, print_exponents)
+        .map_err_context(|| translate!("factor-error-write-error"))?;
+
     Ok(())
 }
 
@@ -123,6 +133,362 @@ fn write_result_u128(
     w.flush()
 }
 
+fn collect_biguint_factors(factors: &[BigUint]) -> BTreeMap<BigUint, usize> {
+    let mut map = BTreeMap::<BigUint, usize>::new();
+    for f in factors {
+        *map.entry(f.clone()).or_insert(0) += 1;
+    }
+    map
+}
+
+fn is_even(value: &BigUint) -> bool {
+    (value & BigUint::one()).is_zero()
+}
+
+fn is_probable_prime(candidate: &BigUint) -> bool {
+    if *candidate < BigUint::from_u32(2).unwrap() {
+        return false;
+    }
+    if *candidate == BigUint::from_u32(2).unwrap() || *candidate == BigUint::from_u32(3).unwrap() {
+        return true;
+    }
+    // even check: candidate % 2 == 0
+    if is_even(candidate) {
+        return false;
+    }
+
+    let one = BigUint::one();
+    let two = BigUint::from_u32(2).unwrap();
+
+    // candidate - 1 = odd_component * 2^power_of_two
+    let mut odd_component = candidate - &one;
+    let mut power_of_two = 0u32;
+    // while odd_component is even
+    while is_even(&odd_component) {
+        odd_component >>= 1;
+        power_of_two += 1;
+    }
+
+    let bases_32: [u64; 3] = [2, 7, 61];
+    let bases_64: [u64; 12] = [
+        2, 325, 9375, 28178, 450775, 9780504, 1795265022, 3, 5, 7, 11, 13,
+    ];
+
+    let bases: Vec<u64> = if candidate.bits() <= 32 {
+        bases_32.to_vec()
+    } else if candidate.bits() <= 64 {
+        bases_64.to_vec()
+    } else {
+        vec![2, 3, 5, 7, 11, 13, 17, 19, 23]
+    };
+
+    'outer: for &base_value in &bases {
+        if BigUint::from(base_value) >= *candidate {
+            continue;
+        }
+        let base = BigUint::from(base_value);
+        let mut witness = base.modpow(&odd_component, candidate);
+        if witness == one || witness == candidate - &one {
+            continue 'outer;
+        }
+
+        for _ in 1..power_of_two {
+            witness = witness.modpow(&two, candidate);
+            if witness == candidate - &one {
+                continue 'outer;
+            }
+            if witness == one {
+                return false;
+            }
+        }
+        return false;
+    }
+
+    true
+}
+
+fn small_trial_division(n: &BigUint) -> Option<BigUint> {
+    // Quickly strip very small prime factors before applying expensive algorithms.
+    // This is intentionally lightweight while still covering a reasonably wide range.
+    // GNU factor maintains a large trial table; here we mimic only a small portion of it.
+    //
+    // NOTE: By removing many small prime factors here, we significantly reduce the
+    //       search space and failure count of subsequent Pollard-rho / p-1 steps.
+    const SMALL_PRIMES: [u16; 54] = [
+        2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89,
+        97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, 157, 163, 167, 173, 179, 181,
+        191, 193, 197, 199, 211, 223, 227, 229, 233, 239, 241, 251,
+    ];
+
+    for &p in &SMALL_PRIMES {
+        let p_big = BigUint::from_u32(p as u32).unwrap();
+        if n == &p_big {
+            return None;
+        }
+        if (n % &p_big).is_zero() {
+            return Some(p_big);
+        }
+    }
+    None
+}
+
+/// Simplified Pollard p-1 method (Stage 1 only).
+/// Effective when p-1 (for a prime divisor p of n) is smooth with small prime factors.
+fn pollard_p_minus_1(n: &BigUint) -> Option<BigUint> {
+    // Stage 1 only (simplified).
+    // Best-effort: we do not spend too long here; give up quickly if it does not help.
+    let one = BigUint::one();
+    let two = BigUint::from_u32(2).unwrap();
+
+    if n.is_zero() || n.is_one() {
+        return None;
+    }
+
+    if is_even(n) {
+        return Some(two);
+    }
+
+    let bits = n.bits();
+
+    // Keep B1 relatively small to avoid excessive cost.
+    // (GNU factor adjusts based on input and retries; we use a fixed approximation.)
+    let b1: u64 = if bits <= 256 {
+        10_000
+    } else if bits <= 512 {
+        20_000
+    } else {
+        50_000
+    };
+
+    // Only try a few small prime bases.
+    const BASES: [u64; 3] = [2, 3, 5];
+
+    for &base in &BASES {
+        let mut a = BigUint::from(base);
+        if &a >= n {
+            continue;
+        }
+
+        let mut g = gcd_biguint(&a, n);
+        if g > one && &g < n {
+            return Some(g);
+        }
+
+        // Construct a^(M) step by step (approximating by extending the exponent with 2^k)
+        let mut e = 2u64;
+        while e <= b1 {
+            a = a.modpow(&BigUint::from_u64(e).unwrap(), n);
+            if a.is_one() {
+                break;
+            }
+            let am1 = if a > one { &a - &one } else { continue };
+            g = gcd_biguint(&am1, n);
+            if g > one && &g < n {
+                return Some(g);
+            }
+            e <<= 1;
+        }
+    }
+
+    None
+}
+
+/// Improved Pollard-rho (Brent variant with batched gcd).
+/// Not equivalent to GNU factor, but aims for better convergence and performance
+/// than a naive implementation.
+fn pollard_rho(composite: &BigUint) -> Option<BigUint> {
+    // NOTE:
+    //  - This implementation is inspired by the approach in GNU factor but simplified.
+    //  - For large inputs we avoid running too long; we cap the iterations so that
+    //    we do not spend many seconds on hopeless cases.
+    //  - If factorization fails, we return "Factorization incomplete"-style results.
+    let one = BigUint::one();
+    let two = BigUint::from_u32(2).unwrap();
+
+    // For small n we expect earlier code paths to have handled the input.
+    if *composite <= BigUint::from_u32(3).unwrap() {
+        return None;
+    }
+
+    // If composite is even, return 2 immediately.
+    if is_even(composite) {
+        return Some(two);
+    }
+
+    // Use a deterministic LCG to generate parameter sequences.
+    const LCG_MULTIPLIER: u128 = 6364136223846793005;
+    const LCG_INCREMENT: u128 = 1442695040888963407;
+
+    fn lcg_next(x: &mut u128) {
+        *x = x.wrapping_mul(LCG_MULTIPLIER).wrapping_add(LCG_INCREMENT);
+    }
+
+    let bits = composite.bits();
+
+    // Search parameters: choose bounds based on bit length.
+    // Avoid overly large limits; when exhausted, treat as failure to find a factor.
+    let max_tries: u64 = 16;
+    let max_iter: u64 = (bits * bits).clamp(10_000, 200_000);
+
+    const LCG_DEFAULT_SEED: u128 = 0x9e3779b97f4a7c15;
+    let mut seed: u128 = LCG_DEFAULT_SEED;
+
+    for _try in 0..max_tries {
+        lcg_next(&mut seed);
+        let mut x_state = BigUint::from(seed % (u128::MAX / 2 + 1));
+        lcg_next(&mut seed);
+        let mut constant = BigUint::from(seed % (u128::MAX / 2 + 1));
+        if constant.is_zero() {
+            constant = BigUint::from(1u32);
+        }
+        x_state %= composite;
+        constant %= composite;
+
+        let mut y_state = x_state.clone();
+        let mut current_gcd = one.clone();
+        let mut product = one.clone();
+
+        let mut iter: u64 = 0;
+        let batch_size: u64 = 128;
+
+        while current_gcd == one && iter < max_iter {
+            // Brent variant: use batched gcd.
+            let mut batch_iter = 0;
+            let x_saved = x_state.clone();
+            while batch_iter < batch_size && iter < max_iter {
+                // f(z) = z^2 + c mod composite.
+                y_state = (&y_state * &y_state + &constant) % composite;
+                let diff = if x_state > y_state {
+                    &x_state - &y_state
+                } else {
+                    &y_state - &x_state
+                };
+                if !diff.is_zero() {
+                    product = (product * diff) % composite;
+                }
+                batch_iter += 1;
+                iter += 1;
+            }
+            current_gcd = gcd_biguint(&product, composite);
+            x_state = x_saved;
+
+            if current_gcd == one {
+                // Update x_state to advance the sequence.
+                x_state.clone_from(&y_state);
+            }
+        }
+
+        if current_gcd == one {
+            continue;
+        }
+        if &current_gcd == composite {
+            // Fallback: step-by-step gcd.
+            let mut z_state = x_state.clone();
+            loop {
+                z_state = (&z_state * &z_state + &constant) % composite;
+                let diff = if z_state > y_state {
+                    &z_state - &y_state
+                } else {
+                    &y_state - &z_state
+                };
+                current_gcd = gcd_biguint(&diff, composite);
+                if current_gcd > one || z_state == y_state {
+                    break;
+                }
+            }
+        }
+
+        if current_gcd > one && &current_gcd < composite {
+            return Some(current_gcd);
+        }
+    }
+
+    None
+}
+
+/// Recursively factor n and append factors (primes or unfactored composites) to `factors`.
+/// Returns true if full factorization succeeded, false otherwise.
+fn gcd_biguint(lhs: &BigUint, rhs: &BigUint) -> BigUint {
+    // Standard Euclidean algorithm using owned BigUint values to avoid lifetime issues.
+    let mut dividend = lhs.clone();
+    let mut divisor = rhs.clone();
+    while !divisor.is_zero() {
+        let remainder = &dividend % &divisor;
+        dividend = divisor;
+        divisor = remainder;
+    }
+    dividend
+}
+
+fn factor_biguint_recursive(n: &BigUint, factors: &mut Vec<BigUint>) -> bool {
+    let one = BigUint::one();
+    if *n <= one {
+        return true;
+    }
+
+    // First remove small prime factors, then apply more expensive methods.
+    if let Some(p) = small_trial_division(n) {
+        let mut q = n.clone();
+        let mut ok = true;
+        while (&q % &p).is_zero() {
+            q /= &p;
+            factors.push(p.clone());
+        }
+        if !q.is_one() {
+            ok &= factor_biguint_recursive(&q, factors);
+        }
+        return ok;
+    }
+
+    // If n is small enough, use num_prime's factorize128 for speed.
+    if n.bits() <= 128 {
+        if let Some(x128) = n.to_u128() {
+            let pf = num_prime::nt_funcs::factorize128(x128);
+            if !pf.is_empty() {
+                for (p, e) in pf {
+                    for _ in 0..e {
+                        factors.push(BigUint::from(p));
+                    }
+                }
+                return true;
+            }
+        }
+    }
+
+    if is_probable_prime(n) {
+        factors.push(n.clone());
+        return true;
+    }
+
+    // Try Pollard p-1 first (simplified Stage 1).
+    if let Some(f) = pollard_p_minus_1(n) {
+        if f.is_one() || &f == n {
+            // Treat as failure.
+        } else {
+            let q = n / &f;
+            let left_ok = factor_biguint_recursive(&f, factors);
+            let right_ok = factor_biguint_recursive(&q, factors);
+            return left_ok && right_ok;
+        }
+    }
+
+    // Then try improved Pollard-rho (Brent variant).
+    if let Some(f) = pollard_rho(n) {
+        if f.is_one() || &f == n {
+            factors.push(n.clone());
+            return false;
+        }
+        let q = n / &f;
+        let left_ok = factor_biguint_recursive(&f, factors);
+        let right_ok = factor_biguint_recursive(&q, factors);
+        return left_ok && right_ok;
+    }
+
+    // If no factor was found, include n itself as part of the (incomplete) factorization.
+    factors.push(n.clone());
+    false
+}
+
 /// Writing out the prime factors for BigUint integers
 fn write_result_big_uint(
     w: &mut io::BufWriter<impl Write>,