From 3ad3bbc639184b6637ad5700ed7e3dfade4c2f19 Mon Sep 17 00:00:00 2001 From: Siina Mashek Date: Tue, 3 Dec 2024 18:06:55 +0200 Subject: [PATCH] removing patches, don't use anymore --- ...sallow-unprivileged-CLONE_NEWUSER-by.patch | 151 - .../sys-kernel/gentoo-sources/0002-bbr3.patch | 3386 --- .../gentoo-sources/0003-glitched-base.patch | 822 - .../gentoo-sources/0003-glitched-cfs.patch | 117 - .../sys-kernel/gentoo-sources/0005-ksm.patch | 433 - .../0006-add-acs-overrides_iommu.patch | 193 - ...7-v6.12-fsync_legacy_via_futex_waitv.patch | 166 - .../sys-kernel/gentoo-sources/0008-zstd.patch | 18652 ---------------- .../gentoo-sources/v4l2loopback.patch | 3767 ---- 9 files changed, 27687 deletions(-) delete mode 100644 patches/sys-kernel/gentoo-sources/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch delete mode 100644 patches/sys-kernel/gentoo-sources/0002-bbr3.patch delete mode 100644 patches/sys-kernel/gentoo-sources/0003-glitched-base.patch delete mode 100644 patches/sys-kernel/gentoo-sources/0003-glitched-cfs.patch delete mode 100644 patches/sys-kernel/gentoo-sources/0005-ksm.patch delete mode 100644 patches/sys-kernel/gentoo-sources/0006-add-acs-overrides_iommu.patch delete mode 100644 patches/sys-kernel/gentoo-sources/0007-v6.12-fsync_legacy_via_futex_waitv.patch delete mode 100644 patches/sys-kernel/gentoo-sources/0008-zstd.patch delete mode 100644 patches/sys-kernel/gentoo-sources/v4l2loopback.patch diff --git a/patches/sys-kernel/gentoo-sources/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch b/patches/sys-kernel/gentoo-sources/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch deleted file mode 100644 index 30a77c7..0000000 --- a/patches/sys-kernel/gentoo-sources/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch +++ /dev/null @@ -1,151 +0,0 @@ -From d50977b164e708bf523a35ef53315355528c3ca6 Mon Sep 17 00:00:00 2001 -From: "Jan Alexander Steffens (heftig)" -Date: Mon, 16 Sep 2019 04:53:20 +0200 -Subject: [PATCH] ZEN: Add sysctl and CONFIG to disallow unprivileged - CLONE_NEWUSER - -Our default behavior continues to match the vanilla kernel. ---- - include/linux/user_namespace.h | 4 ++++ - init/Kconfig | 16 ++++++++++++++++ - kernel/fork.c | 14 ++++++++++++++ - kernel/sysctl.c | 12 ++++++++++++ - kernel/user_namespace.c | 7 +++++++ - 5 files changed, 53 insertions(+) - -diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h -index 45f09bec02c485..87b20e2ee27445 100644 ---- a/include/linux/user_namespace.h -+++ b/include/linux/user_namespace.h -@@ -148,6 +148,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, - - #ifdef CONFIG_USER_NS - -+extern int unprivileged_userns_clone; -+ - static inline struct user_namespace *get_user_ns(struct user_namespace *ns) - { - if (ns) -@@ -181,6 +183,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns); - struct ns_common *ns_get_owner(struct ns_common *ns); - #else - -+#define unprivileged_userns_clone 0 -+ - static inline struct user_namespace *get_user_ns(struct user_namespace *ns) - { - return &init_user_ns; -diff --git a/init/Kconfig b/init/Kconfig -index 94125d3b6893c7..9f7139b536f638 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1247,6 +1247,22 @@ config USER_NS - - If unsure, say N. - -+config USER_NS_UNPRIVILEGED -+ bool "Allow unprivileged users to create namespaces" -+ default y -+ depends on USER_NS -+ help -+ When disabled, unprivileged users will not be able to create -+ new namespaces. 
Allowing users to create their own namespaces -+ has been part of several recent local privilege escalation -+ exploits, so if you need user namespaces but are -+ paranoid^Wsecurity-conscious you want to disable this. -+ -+ This setting can be overridden at runtime via the -+ kernel.unprivileged_userns_clone sysctl. -+ -+ If unsure, say Y. -+ - config PID_NS - bool "PID Namespaces" - default y -diff --git a/kernel/fork.c b/kernel/fork.c -index 08969f5aa38d59..ff601cb7a1fae0 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -98,6 +98,10 @@ - #include - #include - -+#ifdef CONFIG_USER_NS -+#include -+#endif -+ - #include - #include - #include -@@ -2008,6 +2012,10 @@ static __latent_entropy struct task_struct *copy_process( - if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) - return ERR_PTR(-EINVAL); - -+ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) -+ if (!capable(CAP_SYS_ADMIN)) -+ return ERR_PTR(-EPERM); -+ - /* - * Thread groups must share signals as well, and detached threads - * can only be started up within the thread group. -@@ -3166,6 +3174,12 @@ int ksys_unshare(unsigned long unshare_flags) - if (unshare_flags & CLONE_NEWNS) - unshare_flags |= CLONE_FS; - -+ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { -+ err = -EPERM; -+ if (!capable(CAP_SYS_ADMIN)) -+ goto bad_unshare_out; -+ } -+ - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index c6d9dec11b749d..9a4514ad481b21 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -81,6 +81,9 @@ - #ifdef CONFIG_RT_MUTEXES - #include - #endif -+#ifdef CONFIG_USER_NS -+#include -+#endif - - /* shared constants to be used in various sysctls */ - const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; -@@ -1659,6 +1662,15 @@ static struct ctl_table kern_table[] = { - .mode = 0644, - .proc_handler = proc_dointvec, - }, -+#ifdef CONFIG_USER_NS -+ { -+ .procname = "unprivileged_userns_clone", -+ .data = &unprivileged_userns_clone, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec, -+ }, -+#endif - #ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 54211dbd516c57..16ca0c1516298d 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -22,6 +22,13 @@ - #include - #include - -+/* sysctl */ -+#ifdef CONFIG_USER_NS_UNPRIVILEGED -+int unprivileged_userns_clone = 1; -+#else -+int unprivileged_userns_clone; -+#endif -+ - static struct kmem_cache *user_ns_cachep __ro_after_init; - static DEFINE_MUTEX(userns_state_mutex); - diff --git a/patches/sys-kernel/gentoo-sources/0002-bbr3.patch b/patches/sys-kernel/gentoo-sources/0002-bbr3.patch deleted file mode 100644 index f9453a9..0000000 --- a/patches/sys-kernel/gentoo-sources/0002-bbr3.patch +++ /dev/null @@ -1,3386 +0,0 @@ -From 3e76598e1f8e7468aa7ceaee2e1ebc4683c03ed5 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Mon, 7 Oct 2024 11:53:09 +0200 -Subject: [PATCH 2/8] bbr3 - -Signed-off-by: Peter Jung ---- - include/linux/tcp.h | 4 +- - include/net/inet_connection_sock.h | 4 +- - include/net/tcp.h | 72 +- - include/uapi/linux/inet_diag.h | 23 + - include/uapi/linux/rtnetlink.h | 4 +- - include/uapi/linux/tcp.h | 1 + - net/ipv4/Kconfig | 21 +- - net/ipv4/bpf_tcp_ca.c | 9 +- - net/ipv4/tcp.c | 3 + - net/ipv4/tcp_bbr.c | 2230 +++++++++++++++++++++------- - net/ipv4/tcp_cong.c | 1 + - net/ipv4/tcp_input.c | 40 +- - 
net/ipv4/tcp_minisocks.c | 2 + - net/ipv4/tcp_output.c | 48 +- - net/ipv4/tcp_rate.c | 30 +- - net/ipv4/tcp_timer.c | 1 + - 16 files changed, 1940 insertions(+), 553 deletions(-) - -diff --git a/include/linux/tcp.h b/include/linux/tcp.h -index 6a5e08b937b3..27aab715490e 100644 ---- a/include/linux/tcp.h -+++ b/include/linux/tcp.h -@@ -369,7 +369,9 @@ struct tcp_sock { - u8 compressed_ack; - u8 dup_ack_counter:2, - tlp_retrans:1, /* TLP is a retransmission */ -- unused:5; -+ fast_ack_mode:2, /* which fast ack mode ? */ -+ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? */ -+ unused:2; - u8 thin_lto : 1,/* Use linear timeouts for thin streams */ - fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ - fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ -diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h -index c0deaafebfdc..d53f042d936e 100644 ---- a/include/net/inet_connection_sock.h -+++ b/include/net/inet_connection_sock.h -@@ -137,8 +137,8 @@ struct inet_connection_sock { - u32 icsk_probes_tstamp; - u32 icsk_user_timeout; - -- u64 icsk_ca_priv[104 / sizeof(u64)]; --#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) -+#define ICSK_CA_PRIV_SIZE (144) -+ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; - }; - - #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ -diff --git a/include/net/tcp.h b/include/net/tcp.h -index d1948d357dad..7d99f0bec5f2 100644 ---- a/include/net/tcp.h -+++ b/include/net/tcp.h -@@ -375,6 +375,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) - #define TCP_ECN_QUEUE_CWR 2 - #define TCP_ECN_DEMAND_CWR 4 - #define TCP_ECN_SEEN 8 -+#define TCP_ECN_LOW 16 -+#define TCP_ECN_ECT_PERMANENT 32 - - enum tcp_tw_status { - TCP_TW_SUCCESS = 0, -@@ -779,6 +781,15 @@ static inline void tcp_fast_path_check(struct sock *sk) - - u32 tcp_delack_max(const struct sock *sk); - -+static inline void tcp_set_ecn_low_from_dst(struct sock *sk, -+ const struct dst_entry *dst) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ -+ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) -+ tp->ecn_flags |= TCP_ECN_LOW; -+} -+ - /* Compute the actual rto_min value */ - static inline u32 tcp_rto_min(const struct sock *sk) - { -@@ -884,6 +895,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) - return max_t(s64, t1 - t0, 0); - } - -+static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) -+{ -+ return max_t(s32, t1 - t0, 0); -+} -+ - /* provide the departure time in us unit */ - static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) - { -@@ -973,9 +989,14 @@ struct tcp_skb_cb { - /* pkts S/ACKed so far upon tx of skb, incl retrans: */ - __u32 delivered; - /* start of send pipeline phase */ -- u64 first_tx_mstamp; -+ u32 first_tx_mstamp; - /* when we reached the "delivered" count */ -- u64 delivered_mstamp; -+ u32 delivered_mstamp; -+#define TCPCB_IN_FLIGHT_BITS 20 -+#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) -+ u32 in_flight:20, /* packets in flight at transmit */ -+ unused2:12; -+ u32 lost; /* packets lost so far upon tx of skb */ - } tx; /* only used for outgoing skbs */ - union { - struct inet_skb_parm h4; -@@ -1088,6 +1109,7 @@ enum tcp_ca_event { - CA_EVENT_LOSS, /* loss timeout */ - CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ - CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ -+ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ - }; - - /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ -@@ -1110,7 +1132,11 @@ 
enum tcp_ca_ack_event_flags { - #define TCP_CONG_NON_RESTRICTED 0x1 - /* Requires ECN/ECT set on all packets */ - #define TCP_CONG_NEEDS_ECN 0x2 --#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) -+/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ -+#define TCP_CONG_WANTS_CE_EVENTS 0x4 -+#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ -+ TCP_CONG_NEEDS_ECN | \ -+ TCP_CONG_WANTS_CE_EVENTS) - - union tcp_cc_info; - -@@ -1130,10 +1156,13 @@ struct ack_sample { - */ - struct rate_sample { - u64 prior_mstamp; /* starting timestamp for interval */ -+ u32 prior_lost; /* tp->lost at "prior_mstamp" */ - u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ - u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ -+ u32 tx_in_flight; /* packets in flight at starting timestamp */ -+ s32 lost; /* number of packets lost over interval */ - s32 delivered; /* number of packets delivered over interval */ -- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ -+ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ - long interval_us; /* time for tp->delivered to incr "delivered" */ - u32 snd_interval_us; /* snd interval for delivered packets */ - u32 rcv_interval_us; /* rcv interval for delivered packets */ -@@ -1144,7 +1173,9 @@ struct rate_sample { - u32 last_end_seq; /* end_seq of most recently ACKed packet */ - bool is_app_limited; /* is sample from packet with bubble in pipe? */ - bool is_retrans; /* is sample from retransmission? */ -+ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ - bool is_ack_delayed; /* is this (likely) a delayed ACK? */ -+ bool is_ece; /* did this ACK have ECN marked? */ - }; - - struct tcp_congestion_ops { -@@ -1168,8 +1199,11 @@ struct tcp_congestion_ops { - /* hook for packet ack accounting (optional) */ - void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); - -- /* override sysctl_tcp_min_tso_segs */ -- u32 (*min_tso_segs)(struct sock *sk); -+ /* pick target number of segments per TSO/GSO skb (optional): */ -+ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); -+ -+ /* react to a specific lost skb (optional) */ -+ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); - - /* call when packets are delivered to update cwnd and pacing rate, - * after all the ca_state processing. 
(optional) -@@ -1235,6 +1269,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) - } - #endif - -+static inline bool tcp_ca_wants_ce_events(const struct sock *sk) -+{ -+ const struct inet_connection_sock *icsk = inet_csk(sk); -+ -+ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | -+ TCP_CONG_WANTS_CE_EVENTS); -+} -+ - static inline bool tcp_ca_needs_ecn(const struct sock *sk) - { - const struct inet_connection_sock *icsk = inet_csk(sk); -@@ -1254,6 +1296,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) - void tcp_set_ca_state(struct sock *sk, const u8 ca_state); - - /* From tcp_rate.c */ -+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); - void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); - void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, - struct rate_sample *rs); -@@ -1266,6 +1309,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) - return t1 > t2 || (t1 == t2 && after(seq1, seq2)); - } - -+/* If a retransmit failed due to local qdisc congestion or other local issues, -+ * then we may have called tcp_set_skb_tso_segs() to increase the number of -+ * segments in the skb without increasing the tx.in_flight. In all other cases, -+ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We -+ * do not have the state to know whether a retransmit failed due to local qdisc -+ * congestion or other local issues, so to avoid spurious warnings we consider -+ * that any skb marked lost may have suffered that fate. -+ */ -+static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, -+ u32 skb_sacked_flags, -+ u32 tx_in_flight) -+{ -+ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); -+} -+ - /* These functions determine how the current flow behaves in respect of SACK - * handling. SACK is negotiated with the peer, and therefore it can vary - * between different flows. -@@ -2417,7 +2475,7 @@ struct tcp_plb_state { - u8 consec_cong_rounds:5, /* consecutive congested rounds */ - unused:3; - u32 pause_until; /* jiffies32 when PLB can resume rerouting */ --}; -+} __attribute__ ((__packed__)); - - static inline void tcp_plb_init(const struct sock *sk, - struct tcp_plb_state *plb) -diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h -index 86bb2e8b17c9..9d9a3eb2ce9b 100644 ---- a/include/uapi/linux/inet_diag.h -+++ b/include/uapi/linux/inet_diag.h -@@ -229,6 +229,29 @@ struct tcp_bbr_info { - __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ - __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ - __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ -+ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ -+ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ -+ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ -+ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ -+ __u8 bbr_mode; /* current bbr_mode in state machine */ -+ __u8 bbr_phase; /* current state machine phase */ -+ __u8 unused1; /* alignment padding; not used yet */ -+ __u8 bbr_version; /* BBR algorithm version */ -+ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ -+ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ -+ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ -+}; -+ -+/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ -+enum tcp_bbr_phase { -+ BBR_PHASE_INVALID = 0, -+ BBR_PHASE_STARTUP = 1, -+ BBR_PHASE_DRAIN = 2, -+ BBR_PHASE_PROBE_RTT = 3, -+ BBR_PHASE_PROBE_BW_UP = 4, -+ BBR_PHASE_PROBE_BW_DOWN = 5, -+ BBR_PHASE_PROBE_BW_CRUISE = 6, -+ BBR_PHASE_PROBE_BW_REFILL = 7, - }; - - union tcp_cc_info { -diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h -index 3b687d20c9ed..a7c30c243b54 100644 ---- a/include/uapi/linux/rtnetlink.h -+++ b/include/uapi/linux/rtnetlink.h -@@ -507,12 +507,14 @@ enum { - #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ - #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ - #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) -+#define RTAX_FEATURE_ECN_LOW (1 << 5) - - #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ - RTAX_FEATURE_SACK | \ - RTAX_FEATURE_TIMESTAMP | \ - RTAX_FEATURE_ALLFRAG | \ -- RTAX_FEATURE_TCP_USEC_TS) -+ RTAX_FEATURE_TCP_USEC_TS | \ -+ RTAX_FEATURE_ECN_LOW) - - struct rta_session { - __u8 proto; -diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h -index dbf896f3146c..4702cd2f1ffc 100644 ---- a/include/uapi/linux/tcp.h -+++ b/include/uapi/linux/tcp.h -@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail { - #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ - #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ - #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ -+#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN configured at init */ - - /* - * Sender's congestion state indicating normal or abnormal situations -diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index 6d2c97f8e9ef..ddc116ef22cb 100644 ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -669,15 +669,18 @@ config TCP_CONG_BBR - default n - help - -- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to -- maximize network utilization and minimize queues. It builds an explicit -- model of the bottleneck delivery rate and path round-trip propagation -- delay. It tolerates packet loss and delay unrelated to congestion. It -- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can -- coexist with flows that use loss-based congestion control, and can -- operate with shallow buffers, deep buffers, bufferbloat, policers, or -- AQM schemes that do not provide a delay signal. It requires the fq -- ("Fair Queue") pacing packet scheduler. -+ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a -+ model-based congestion control algorithm that aims to maximize -+ network utilization, keep queues and retransmit rates low, and to be -+ able to coexist with Reno/CUBIC in common scenarios. It builds an -+ explicit model of the network path. It tolerates a targeted degree -+ of random packet loss and delay. It can operate over LAN, WAN, -+ cellular, wifi, or cable modem links, and can use shallow-threshold -+ ECN signals. It can coexist to some degree with flows that use -+ loss-based congestion control, and can operate with shallow buffers, -+ deep buffers, bufferbloat, policers, or AQM schemes that do not -+ provide a delay signal. It requires pacing, using either TCP internal -+ pacing or the fq ("Fair Queue") pacing packet scheduler. 
- - choice - prompt "Default TCP congestion control" -diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c -index 554804774628..2279e6e7bc9c 100644 ---- a/net/ipv4/bpf_tcp_ca.c -+++ b/net/ipv4/bpf_tcp_ca.c -@@ -280,11 +280,15 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp - { - } - --static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) -+static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) - { - return 0; - } - -+static void bpf_tcp_ca_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) -+{ -+} -+ - static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, - const struct rate_sample *rs) - { -@@ -315,7 +319,8 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { - .cwnd_event = bpf_tcp_ca_cwnd_event, - .in_ack_event = bpf_tcp_ca_in_ack_event, - .pkts_acked = bpf_tcp_ca_pkts_acked, -- .min_tso_segs = bpf_tcp_ca_min_tso_segs, -+ .tso_segs = bpf_tcp_ca_tso_segs, -+ .skb_marked_lost = bpf_tcp_ca_skb_marked_lost, - .cong_control = bpf_tcp_ca_cong_control, - .undo_cwnd = bpf_tcp_ca_undo_cwnd, - .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, -diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 4f77bd862e95..fd3a5551eda7 100644 ---- a/net/ipv4/tcp.c -+++ b/net/ipv4/tcp.c -@@ -3384,6 +3384,7 @@ int tcp_disconnect(struct sock *sk, int flags) - tp->rx_opt.dsack = 0; - tp->rx_opt.num_sacks = 0; - tp->rcv_ooopack = 0; -+ tp->fast_ack_mode = 0; - - - /* Clean up fastopen related fields */ -@@ -4110,6 +4111,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) - info->tcpi_options |= TCPI_OPT_ECN; - if (tp->ecn_flags & TCP_ECN_SEEN) - info->tcpi_options |= TCPI_OPT_ECN_SEEN; -+ if (tp->ecn_flags & TCP_ECN_LOW) -+ info->tcpi_options |= TCPI_OPT_ECN_LOW; - if (tp->syn_data_acked) - info->tcpi_options |= TCPI_OPT_SYN_DATA; - if (tp->tcp_usec_ts) -diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c -index 760941e55153..a180fa648d5e 100644 ---- a/net/ipv4/tcp_bbr.c -+++ b/net/ipv4/tcp_bbr.c -@@ -1,18 +1,19 @@ --/* Bottleneck Bandwidth and RTT (BBR) congestion control -+/* BBR (Bottleneck Bandwidth and RTT) congestion control - * -- * BBR congestion control computes the sending rate based on the delivery -- * rate (throughput) estimated from ACKs. In a nutshell: -+ * BBR is a model-based congestion control algorithm that aims for low queues, -+ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the -+ * network path, it uses measurements of bandwidth and RTT, as well as (if they -+ * occur) packet loss and/or shallow-threshold ECN signals. Note that although -+ * it can use ECN or loss signals explicitly, it does not require either; it -+ * can bound its in-flight data based on its estimate of the BDP. - * -- * On each ACK, update our model of the network path: -- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) -- * min_rtt = windowed_min(rtt, 10 seconds) -- * pacing_rate = pacing_gain * bottleneck_bandwidth -- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) -- * -- * The core algorithm does not react directly to packet losses or delays, -- * although BBR may adjust the size of next send per ACK when loss is -- * observed, or adjust the sending rate if it estimates there is a -- * traffic policer, in order to keep the drop rate reasonable. 
-+ * The model has both higher and lower bounds for the operating range: -+ * lo: bw_lo, inflight_lo: conservative short-term lower bound -+ * hi: bw_hi, inflight_hi: robust long-term upper bound -+ * The bandwidth-probing time scale is (a) extended dynamically based on -+ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by -+ * an interactive wall-clock time-scale to be more scalable and responsive -+ * than Reno and CUBIC. - * - * Here is a state transition diagram for BBR: - * -@@ -65,6 +66,13 @@ - #include - #include - -+#include -+#include "tcp_dctcp.h" -+ -+#define BBR_VERSION 3 -+ -+#define bbr_param(sk,name) (bbr_ ## name) -+ - /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth - * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. - * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. -@@ -85,36 +93,41 @@ enum bbr_mode { - BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ - }; - -+/* How does the incoming ACK stream relate to our bandwidth probing? */ -+enum bbr_ack_phase { -+ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ -+ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ -+ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ -+ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ -+ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ -+}; -+ - /* BBR congestion control block */ - struct bbr { - u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ - u32 min_rtt_stamp; /* timestamp of min_rtt_us */ - u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ -- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ -- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ -+ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ -+ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ - u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ - u64 cycle_mstamp; /* time of this cycle phase start */ -- u32 mode:3, /* current bbr_mode in state machine */ -+ u32 mode:2, /* current bbr_mode in state machine */ - prev_ca_state:3, /* CA state on previous ACK */ -- packet_conservation:1, /* use packet conservation? */ - round_start:1, /* start of packet-timed tx->ack round? */ -+ ce_state:1, /* If most recent data has CE bit set */ -+ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ -+ try_fast_path:1, /* can we take fast path? */ - idle_restart:1, /* restarting after idle? */ - probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ -- unused:13, -- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ -- lt_rtt_cnt:7, /* round trips in long-term interval */ -- lt_use_bw:1; /* use lt_bw as our bw estimate? */ -- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ -- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ -- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ -- u32 lt_last_lost; /* LT intvl start: tp->lost */ -+ init_cwnd:7, /* initial cwnd */ -+ unused_1:10; - u32 pacing_gain:10, /* current gain for setting pacing rate */ - cwnd_gain:10, /* current gain for setting cwnd */ - full_bw_reached:1, /* reached full bw in Startup? */ - full_bw_cnt:2, /* number of rounds without large bw gains */ -- cycle_idx:3, /* current index in pacing_gain cycle array */ -+ cycle_idx:2, /* current index in pacing_gain cycle array */ - has_seen_rtt:1, /* have we seen an RTT sample yet? 
*/ -- unused_b:5; -+ unused_2:6; - u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ - u32 full_bw; /* recent bw, to estimate if pipe is full */ - -@@ -124,19 +137,67 @@ struct bbr { - u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ - extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ - extra_acked_win_idx:1, /* current index in extra_acked array */ -- unused_c:6; -+ /* BBR v3 state: */ -+ full_bw_now:1, /* recently reached full bw plateau? */ -+ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ -+ loss_in_cycle:1, /* packet loss in this cycle? */ -+ ecn_in_cycle:1, /* ECN in this cycle? */ -+ unused_3:1; -+ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ -+ u32 undo_bw_lo; /* bw_lo before latest losses */ -+ u32 undo_inflight_lo; /* inflight_lo before latest losses */ -+ u32 undo_inflight_hi; /* inflight_hi before latest losses */ -+ u32 bw_latest; /* max delivered bw in last round trip */ -+ u32 bw_lo; /* lower bound on sending bandwidth */ -+ u32 bw_hi[2]; /* max recent measured bw sample */ -+ u32 inflight_latest; /* max delivered data in last round trip */ -+ u32 inflight_lo; /* lower bound of inflight data range */ -+ u32 inflight_hi; /* upper bound of inflight data range */ -+ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ -+ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ -+ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ -+ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ -+ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ -+ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ -+ bw_probe_samples:1, /* rate samples reflect bw probing? */ -+ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ -+ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ -+ rounds_since_probe:8, /* packet-timed rounds since probed bw */ -+ loss_round_start:1, /* loss_round_delivered round trip? */ -+ loss_in_round:1, /* loss marked in this round trip? */ -+ ecn_in_round:1, /* ECN marked in this round trip? */ -+ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ -+ loss_events_in_round:4,/* losses in STARTUP round */ -+ initialized:1; /* has bbr_init() been called? */ -+ u32 alpha_last_delivered; /* tp->delivered at alpha update */ -+ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ -+ -+ u8 unused_4; /* to preserve alignment */ -+ struct tcp_plb_state plb; - }; - --#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ -+struct bbr_context { -+ u32 sample_bw; -+}; - --/* Window length of bw filter (in rounds): */ --static const int bbr_bw_rtts = CYCLE_LEN + 2; - /* Window length of min_rtt filter (in sec): */ - static const u32 bbr_min_rtt_win_sec = 10; - /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ - static const u32 bbr_probe_rtt_mode_ms = 200; --/* Skip TSO below the following bandwidth (bits/sec): */ --static const int bbr_min_tso_rate = 1200000; -+/* Window length of probe_rtt_min_us filter (in ms), and consequently the -+ * typical interval between PROBE_RTT mode entries. The default is 5000ms. 
-+ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC -+ */ -+static const u32 bbr_probe_rtt_win_ms = 5000; -+/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ -+static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; -+ -+/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting -+ * in bigger TSO bursts. We cut the RTT-based allowance in half -+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance -+ * is below 1500 bytes after 6 * ~500 usec = 3ms. -+ */ -+static const u32 bbr_tso_rtt_shift = 9; - - /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. - * In order to help drive the network toward lower queues and low latency while -@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; - */ - static const int bbr_pacing_margin_percent = 1; - --/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain -+/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value - * that will allow a smoothly increasing pacing rate that will double each RTT - * and send the same number of packets per RTT that an un-paced, slow-starting - * Reno or CUBIC flow would: - */ --static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; --/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain -+static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; -+/* The gain for deriving startup cwnd: */ -+static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; -+/* The pacing gain in BBR_DRAIN is calculated to typically drain - * the queue created in BBR_STARTUP in a single round: - */ - static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; -@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; - static const int bbr_cwnd_gain = BBR_UNIT * 2; - /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ - static const int bbr_pacing_gain[] = { -- BBR_UNIT * 5 / 4, /* probe for more available bw */ -- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ -- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ -- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ -+ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ -+ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ -+ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ -+ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ -+}; -+enum bbr_pacing_gain_phase { -+ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ -+ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ -+ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ -+ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ - }; --/* Randomize the starting gain cycling phase over N phases: */ --static const u32 bbr_cycle_rand = 7; - - /* Try to keep at least this many packets in flight, if things go smoothly. For - * smooth functioning, a sliding window protocol ACKing every other packet -@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; - */ - static const u32 bbr_cwnd_min_target = 4; - --/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ -+/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... 
*/ - /* If bw has increased significantly (1.25x), there may be more bw available: */ - static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; - /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ - static const u32 bbr_full_bw_cnt = 3; - --/* "long-term" ("LT") bandwidth estimator parameters... */ --/* The minimum number of rounds in an LT bw sampling interval: */ --static const u32 bbr_lt_intvl_min_rtts = 4; --/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ --static const u32 bbr_lt_loss_thresh = 50; --/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ --static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; --/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ --static const u32 bbr_lt_bw_diff = 4000 / 8; --/* If we estimate we're policed, use lt_bw for this many round trips: */ --static const u32 bbr_lt_bw_max_rtts = 48; -- - /* Gain factor for adding extra_acked to target cwnd: */ - static const int bbr_extra_acked_gain = BBR_UNIT; - /* Window length of extra_acked window. */ -@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; - /* Time period for clamping cwnd increment due to ack aggregation */ - static const u32 bbr_extra_acked_max_us = 100 * 1000; - -+/* Flags to control BBR ECN-related behavior... */ -+ -+/* Ensure ACKs only ACK packets with consistent ECN CE status? */ -+static const bool bbr_precise_ece_ack = true; -+ -+/* Max RTT (in usec) at which to use sender-side ECN logic. -+ * Disabled when 0 (ECN allowed at any RTT). -+ */ -+static const u32 bbr_ecn_max_rtt_us = 5000; -+ -+/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. -+ * No loss response when 0. -+ */ -+static const u32 bbr_beta = BBR_UNIT * 30 / 100; -+ -+/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ -+static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; -+ -+/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly -+ * to congestion if the bottleneck is congested when the flow starts up. -+ */ -+static const u32 bbr_ecn_alpha_init = BBR_UNIT; -+ -+/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. -+ * No ECN based bounding when 0. -+ */ -+static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ -+ -+/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. -+ * Scaled by BBR_SCALE. Disabled when 0. -+ */ -+static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ -+ -+/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN -+ * clears then make the first round's increment to inflight_hi the following -+ * fraction of inflight_hi. -+ */ -+static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; -+ -+/* Estimate bw probing has gone too far if loss rate exceeds this level. */ -+static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ -+ -+/* Slow down for a packet loss recovered by TLP? */ -+static const bool bbr_loss_probe_recovery = true; -+ -+/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, -+ * and loss rate is higher than bbr_loss_thresh. -+ * Disabled if 0. -+ */ -+static const u32 bbr_full_loss_cnt = 6; -+ -+/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh -+ * meets this count. -+ */ -+static const u32 bbr_full_ecn_cnt = 2; -+ -+/* Fraction of unutilized headroom to try to leave in path upon high loss. 
*/ -+static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; -+ -+/* How much do we increase cwnd_gain when probing for bandwidth in -+ * BBR_BW_PROBE_UP? This specifies the increment in units of -+ * BBR_UNIT/4. The default is 1, meaning 0.25. -+ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). -+ */ -+static const u32 bbr_bw_probe_cwnd_gain = 1; -+ -+/* Max number of packet-timed rounds to wait before probing for bandwidth. If -+ * we want to tolerate 1% random loss per round, and not have this cut our -+ * inflight too much, we must probe for bw periodically on roughly this scale. -+ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. -+ * We aim to be fair with Reno/CUBIC up to a BDP of at least: -+ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets -+ */ -+static const u32 bbr_bw_probe_max_rounds = 63; -+ -+/* Max amount of randomness to inject in round counting for Reno-coexistence. -+ */ -+static const u32 bbr_bw_probe_rand_rounds = 2; -+ -+/* Use BBR-native probe time scale starting at this many usec. -+ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: -+ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs -+ */ -+static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ -+ -+/* Use BBR-native probes spread over this many usec: */ -+static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ -+ -+/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ -+static const bool bbr_fast_path = true; -+ -+/* Use fast ack mode? */ -+static const bool bbr_fast_ack_mode = true; -+ -+static u32 bbr_max_bw(const struct sock *sk); -+static u32 bbr_bw(const struct sock *sk); -+static void bbr_exit_probe_rtt(struct sock *sk); -+static void bbr_reset_congestion_signals(struct sock *sk); -+static void bbr_run_loss_probe_recovery(struct sock *sk); -+ - static void bbr_check_probe_rtt_done(struct sock *sk); - -+/* This connection can use ECN if both endpoints have signaled ECN support in -+ * the handshake and the per-route settings indicated this is a -+ * shallow-threshold ECN environment, meaning both: -+ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and -+ * (b) TCP endpoints provide precise ACKs that only ACK data segments -+ * with consistent ECN CE status -+ */ -+static bool bbr_can_use_ecn(const struct sock *sk) -+{ -+ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && -+ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); -+} -+ - /* Do we estimate that STARTUP filled the pipe? */ - static bool bbr_full_bw_reached(const struct sock *sk) - { -@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) - /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ - static u32 bbr_max_bw(const struct sock *sk) - { -- struct bbr *bbr = inet_csk_ca(sk); -+ const struct bbr *bbr = inet_csk_ca(sk); - -- return minmax_get(&bbr->bw); -+ return max(bbr->bw_hi[0], bbr->bw_hi[1]); - } - - /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ - static u32 bbr_bw(const struct sock *sk) - { -- struct bbr *bbr = inet_csk_ca(sk); -+ const struct bbr *bbr = inet_csk_ca(sk); - -- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); -+ return min(bbr_max_bw(sk), bbr->bw_lo); - } - - /* Return maximum extra acked in past k-2k round trips, -@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk) - * The order here is chosen carefully to avoid overflow of u64. 
This should - * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. - */ --static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) -+static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, -+ int margin) - { - unsigned int mss = tcp_sk(sk)->mss_cache; - - rate *= mss; - rate *= gain; - rate >>= BBR_SCALE; -- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); -- return rate >> BW_SCALE; -+ rate *= USEC_PER_SEC / 100 * (100 - margin); -+ rate >>= BW_SCALE; -+ rate = max(rate, 1ULL); -+ return rate; -+} -+ -+static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) -+{ -+ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); - } - - /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ -@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) - { - u64 rate = bw; - -- rate = bbr_rate_bytes_per_sec(sk, rate, gain); -+ rate = bbr_rate_bytes_per_sec(sk, rate, gain, -+ bbr_pacing_margin_percent); - rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); - return rate; - } - --/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ -+/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ - static void bbr_init_pacing_rate_from_rtt(struct sock *sk) - { - struct tcp_sock *tp = tcp_sk(sk); -@@ -279,7 +456,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) - bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; - do_div(bw, rtt_us); - WRITE_ONCE(sk->sk_pacing_rate, -- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); -+ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain))); - } - - /* Pace using current bw estimate and a gain factor. */ -@@ -295,26 +472,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) - WRITE_ONCE(sk->sk_pacing_rate, rate); - } - --/* override sysctl_tcp_min_tso_segs */ --__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) -+/* Return the number of segments BBR would like in a TSO/GSO skb, given a -+ * particular max gso size as a constraint. TODO: make this simpler and more -+ * consistent by switching bbr to just call tcp_tso_autosize(). -+ */ -+static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, -+ u32 gso_max_size) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 segs, r; -+ u64 bytes; -+ -+ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ -+ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); -+ -+ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every -+ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. -+ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) -+ */ -+ if (bbr_param(sk, tso_rtt_shift)) { -+ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); -+ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ -+ bytes += GSO_LEGACY_MAX_SIZE >> r; -+ } -+ -+ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); -+ segs = max_t(u32, bytes / mss_now, -+ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); -+ return segs; -+} -+ -+/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ -+__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) - { -- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; -+ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); - } - -+/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ - static u32 bbr_tso_segs_goal(struct sock *sk) - { - struct tcp_sock *tp = tcp_sk(sk); -- u32 segs, bytes; -- -- /* Sort of tcp_tso_autosize() but ignoring -- * driver provided sk_gso_max_size. -- */ -- bytes = min_t(unsigned long, -- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), -- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); -- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); - -- return min(segs, 0x7FU); -+ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); - } - - /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ -@@ -334,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) - struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); - -- if (event == CA_EVENT_TX_START && tp->app_limited) { -+ if (event == CA_EVENT_TX_START) { -+ if (!tp->app_limited) -+ return; - bbr->idle_restart = 1; - bbr->ack_epoch_mstamp = tp->tcp_mstamp; - bbr->ack_epoch_acked = 0; -@@ -345,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) - bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); - else if (bbr->mode == BBR_PROBE_RTT) - bbr_check_probe_rtt_done(sk); -+ } else if ((event == CA_EVENT_ECN_IS_CE || -+ event == CA_EVENT_ECN_NO_CE) && -+ bbr_can_use_ecn(sk) && -+ bbr_param(sk, precise_ece_ack)) { -+ u32 state = bbr->ce_state; -+ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); -+ bbr->ce_state = state; -+ } else if (event == CA_EVENT_TLP_RECOVERY && -+ bbr_param(sk, loss_probe_recovery)) { -+ bbr_run_loss_probe_recovery(sk); - } - } - -@@ -367,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) - * default. This should only happen when the connection is not using TCP - * timestamps and has retransmitted all of the SYN/SYNACK/data packets - * ACKed so far. In this case, an RTO can cut cwnd to 1, in which -- * case we need to slow-start up toward something safe: TCP_INIT_CWND. -+ * case we need to slow-start up toward something safe: initial cwnd. - */ - if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ -- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ -+ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ - - w = (u64)bw * bbr->min_rtt_us; - -@@ -387,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) - * - one skb in sending host Qdisc, - * - one skb in sending host TSO/GSO engine - * - one skb being received by receiver host LRO/GRO/delayed-ACK engine -- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because -- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, -+ * Don't worry, at low rates this won't bloat cwnd because -+ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, - * which allows 2 outstanding 2-packet sequences, to try to keep pipe - * full even with ACK-every-other-packet delayed ACKs. - */ - static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) - { - struct bbr *bbr = inet_csk_ca(sk); -+ u32 tso_segs_goal; - -- /* Allow enough full-sized skbs in flight to utilize end systems. */ -- cwnd += 3 * bbr_tso_segs_goal(sk); -- -- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ -- cwnd = (cwnd + 1) & ~1U; -+ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); - -+ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ -+ cwnd = max_t(u32, cwnd, tso_segs_goal); -+ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); - /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ -- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) -+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) - cwnd += 2; - - return cwnd; -@@ -458,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) - { - u32 max_aggr_cwnd, aggr_cwnd = 0; - -- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { -+ if (bbr_param(sk, extra_acked_gain)) { - max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) - / BW_UNIT; -- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) -+ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) - >> BBR_SCALE; - aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); - } -@@ -469,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) - return aggr_cwnd; - } - --/* An optimization in BBR to reduce losses: On the first round of recovery, we -- * follow the packet conservation principle: send P packets per P packets acked. -- * After that, we slow-start and send at most 2*P packets per P packets acked. -- * After recovery finishes, or upon undo, we restore the cwnd we had when -- * recovery started (capped by the target cwnd based on estimated BDP). -- * -- * TODO(ycheng/ncardwell): implement a rate-based approach. -- */ --static bool bbr_set_cwnd_to_recover_or_restore( -- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) -+/* Returns the cwnd for PROBE_RTT mode. */ -+static u32 bbr_probe_rtt_cwnd(struct sock *sk) - { -- struct tcp_sock *tp = tcp_sk(sk); -- struct bbr *bbr = inet_csk_ca(sk); -- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; -- u32 cwnd = tcp_snd_cwnd(tp); -- -- /* An ACK for P pkts should release at most 2*P packets. We do this -- * in two steps. First, here we deduct the number of lost packets. -- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. -- */ -- if (rs->losses > 0) -- cwnd = max_t(s32, cwnd - rs->losses, 1); -- -- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { -- /* Starting 1st round of Recovery, so do packet conservation. */ -- bbr->packet_conservation = 1; -- bbr->next_rtt_delivered = tp->delivered; /* start round now */ -- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ -- cwnd = tcp_packets_in_flight(tp) + acked; -- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { -- /* Exiting loss recovery; restore cwnd saved before recovery. */ -- cwnd = max(cwnd, bbr->prior_cwnd); -- bbr->packet_conservation = 0; -- } -- bbr->prev_ca_state = state; -- -- if (bbr->packet_conservation) { -- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); -- return true; /* yes, using packet conservation */ -- } -- *new_cwnd = cwnd; -- return false; -+ return max_t(u32, bbr_param(sk, cwnd_min_target), -+ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); - } - - /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss - * has drawn us down below target), or snap down to target if we're above it. 
- */ - static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, -- u32 acked, u32 bw, int gain) -+ u32 acked, u32 bw, int gain, u32 cwnd, -+ struct bbr_context *ctx) - { - struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); -- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; -+ u32 target_cwnd = 0; - - if (!acked) - goto done; /* no packet fully ACKed; just apply caps */ - -- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) -- goto done; -- - target_cwnd = bbr_bdp(sk, bw, gain); - - /* Increment the cwnd to account for excess ACKed data that seems -@@ -537,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, - target_cwnd += bbr_ack_aggregation_cwnd(sk); - target_cwnd = bbr_quantization_budget(sk, target_cwnd); - -- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ -- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ -- cwnd = min(cwnd + acked, target_cwnd); -- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) -- cwnd = cwnd + acked; -- cwnd = max(cwnd, bbr_cwnd_min_target); -+ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ -+ bbr->try_fast_path = 0; -+ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ -+ cwnd += acked; -+ if (cwnd >= target_cwnd) { -+ cwnd = target_cwnd; -+ bbr->try_fast_path = 1; -+ } -+ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { -+ cwnd += acked; -+ } else { -+ bbr->try_fast_path = 1; -+ } - -+ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); - done: -- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ -+ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ - if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ -- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); --} -- --/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ --static bool bbr_is_next_cycle_phase(struct sock *sk, -- const struct rate_sample *rs) --{ -- struct tcp_sock *tp = tcp_sk(sk); -- struct bbr *bbr = inet_csk_ca(sk); -- bool is_full_length = -- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > -- bbr->min_rtt_us; -- u32 inflight, bw; -- -- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully -- * use the pipe without increasing the queue. -- */ -- if (bbr->pacing_gain == BBR_UNIT) -- return is_full_length; /* just use wall clock time */ -- -- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); -- bw = bbr_max_bw(sk); -- -- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at -- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is -- * small (e.g. on a LAN). We do not persist if packets are lost, since -- * a path with small buffers may not hold that much. -- */ -- if (bbr->pacing_gain > BBR_UNIT) -- return is_full_length && -- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ -- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); -- -- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw -- * probing didn't find more bw. If inflight falls to match BDP then we -- * estimate queue is drained; persisting would underutilize the pipe. 
-- */ -- return is_full_length || -- inflight <= bbr_inflight(sk, bw, BBR_UNIT); --} -- --static void bbr_advance_cycle_phase(struct sock *sk) --{ -- struct tcp_sock *tp = tcp_sk(sk); -- struct bbr *bbr = inet_csk_ca(sk); -- -- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); -- bbr->cycle_mstamp = tp->delivered_mstamp; --} -- --/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ --static void bbr_update_cycle_phase(struct sock *sk, -- const struct rate_sample *rs) --{ -- struct bbr *bbr = inet_csk_ca(sk); -- -- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) -- bbr_advance_cycle_phase(sk); -+ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), -+ bbr_probe_rtt_cwnd(sk))); - } - - static void bbr_reset_startup_mode(struct sock *sk) -@@ -614,191 +738,49 @@ static void bbr_reset_startup_mode(struct sock *sk) - bbr->mode = BBR_STARTUP; - } - --static void bbr_reset_probe_bw_mode(struct sock *sk) --{ -- struct bbr *bbr = inet_csk_ca(sk); -- -- bbr->mode = BBR_PROBE_BW; -- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); -- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ --} -- --static void bbr_reset_mode(struct sock *sk) --{ -- if (!bbr_full_bw_reached(sk)) -- bbr_reset_startup_mode(sk); -- else -- bbr_reset_probe_bw_mode(sk); --} -- --/* Start a new long-term sampling interval. */ --static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) --{ -- struct tcp_sock *tp = tcp_sk(sk); -- struct bbr *bbr = inet_csk_ca(sk); -- -- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); -- bbr->lt_last_delivered = tp->delivered; -- bbr->lt_last_lost = tp->lost; -- bbr->lt_rtt_cnt = 0; --} -- --/* Completely reset long-term bandwidth sampling. */ --static void bbr_reset_lt_bw_sampling(struct sock *sk) --{ -- struct bbr *bbr = inet_csk_ca(sk); -- -- bbr->lt_bw = 0; -- bbr->lt_use_bw = 0; -- bbr->lt_is_sampling = false; -- bbr_reset_lt_bw_sampling_interval(sk); --} -- --/* Long-term bw sampling interval is done. Estimate whether we're policed. */ --static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) --{ -- struct bbr *bbr = inet_csk_ca(sk); -- u32 diff; -- -- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ -- /* Is new bw close to the lt_bw from the previous interval? */ -- diff = abs(bw - bbr->lt_bw); -- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || -- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= -- bbr_lt_bw_diff)) { -- /* All criteria are met; estimate we're policed. */ -- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ -- bbr->lt_use_bw = 1; -- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ -- bbr->lt_rtt_cnt = 0; -- return; -- } -- } -- bbr->lt_bw = bw; -- bbr_reset_lt_bw_sampling_interval(sk); --} -- --/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of -- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and -- * explicitly models their policed rate, to reduce unnecessary losses. We -- * estimate that we're policed if we see 2 consecutive sampling intervals with -- * consistent throughput and high packet loss. If we think we're being policed, -- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. -+/* See if we have reached next round trip. Upon start of the new round, -+ * returns packets delivered since previous round start plus this ACK. 
- */ --static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) --{ -- struct tcp_sock *tp = tcp_sk(sk); -- struct bbr *bbr = inet_csk_ca(sk); -- u32 lost, delivered; -- u64 bw; -- u32 t; -- -- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ -- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && -- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { -- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ -- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ -- } -- return; -- } -- -- /* Wait for the first loss before sampling, to let the policer exhaust -- * its tokens and estimate the steady-state rate allowed by the policer. -- * Starting samples earlier includes bursts that over-estimate the bw. -- */ -- if (!bbr->lt_is_sampling) { -- if (!rs->losses) -- return; -- bbr_reset_lt_bw_sampling_interval(sk); -- bbr->lt_is_sampling = true; -- } -- -- /* To avoid underestimates, reset sampling if we run out of data. */ -- if (rs->is_app_limited) { -- bbr_reset_lt_bw_sampling(sk); -- return; -- } -- -- if (bbr->round_start) -- bbr->lt_rtt_cnt++; /* count round trips in this interval */ -- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) -- return; /* sampling interval needs to be longer */ -- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { -- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ -- return; -- } -- -- /* End sampling interval when a packet is lost, so we estimate the -- * policer tokens were exhausted. Stopping the sampling before the -- * tokens are exhausted under-estimates the policed rate. -- */ -- if (!rs->losses) -- return; -- -- /* Calculate packets lost and delivered in sampling interval. */ -- lost = tp->lost - bbr->lt_last_lost; -- delivered = tp->delivered - bbr->lt_last_delivered; -- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ -- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) -- return; -- -- /* Find average delivery rate in this sampling interval. 
*/ -- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; -- if ((s32)t < 1) -- return; /* interval is less than one ms, so wait */ -- /* Check if can multiply without overflow */ -- if (t >= ~0U / USEC_PER_MSEC) { -- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ -- return; -- } -- t *= USEC_PER_MSEC; -- bw = (u64)delivered * BW_UNIT; -- do_div(bw, t); -- bbr_lt_bw_interval_done(sk, bw); --} -- --/* Estimate the bandwidth based on how fast packets are delivered */ --static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) -+static u32 bbr_update_round_start(struct sock *sk, -+ const struct rate_sample *rs, struct bbr_context *ctx) - { - struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); -- u64 bw; -+ u32 round_delivered = 0; - - bbr->round_start = 0; -- if (rs->delivered < 0 || rs->interval_us <= 0) -- return; /* Not a valid observation */ - - /* See if we've reached the next RTT */ -- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { -+ if (rs->interval_us > 0 && -+ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { -+ round_delivered = tp->delivered - bbr->next_rtt_delivered; - bbr->next_rtt_delivered = tp->delivered; -- bbr->rtt_cnt++; - bbr->round_start = 1; -- bbr->packet_conservation = 0; - } -+ return round_delivered; -+} - -- bbr_lt_bw_sampling(sk, rs); -+/* Calculate the bandwidth based on how fast packets are delivered */ -+static void bbr_calculate_bw_sample(struct sock *sk, -+ const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ u64 bw = 0; - - /* Divide delivered by the interval to find a (lower bound) bottleneck - * bandwidth sample. Delivered is in packets and interval_us in uS and - * ratio will be <<1 for most connections. So delivered is first scaled. -+ * Round up to allow growth at low rates, even with integer division. - */ -- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); -- -- /* If this sample is application-limited, it is likely to have a very -- * low delivered count that represents application behavior rather than -- * the available network rate. Such a sample could drag down estimated -- * bw, causing needless slow-down. Thus, to continue to send at the -- * last measured network rate, we filter out app-limited samples unless -- * they describe the path bw at least as well as our bw model. -- * -- * So the goal during app-limited phase is to proceed with the best -- * network rate no matter how long. We automatically leave this -- * phase when app writes faster than the network can deliver :) -- */ -- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { -- /* Incorporate new sample into our max bw filter. */ -- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); -+ if (rs->interval_us > 0) { -+ if (WARN_ONCE(rs->delivered < 0, -+ "negative delivered: %d interval_us: %ld\n", -+ rs->delivered, rs->interval_us)) -+ return; -+ -+ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); - } -+ -+ ctx->sample_bw = bw; - } - - /* Estimates the windowed max degree of ack aggregation. -@@ -812,7 +794,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) - * - * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). - * Max filter is an approximate sliding window of 5-10 (packet timed) round -- * trips. -+ * trips for non-startup phase, and 1-2 round trips for startup. 
- */ - static void bbr_update_ack_aggregation(struct sock *sk, - const struct rate_sample *rs) -@@ -820,15 +802,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, - u32 epoch_us, expected_acked, extra_acked; - struct bbr *bbr = inet_csk_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); -+ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); - -- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || -+ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || - rs->delivered < 0 || rs->interval_us <= 0) - return; - - if (bbr->round_start) { - bbr->extra_acked_win_rtts = min(0x1F, - bbr->extra_acked_win_rtts + 1); -- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { -+ if (!bbr_full_bw_reached(sk)) -+ extra_acked_win_rtts_thresh = 1; -+ if (bbr->extra_acked_win_rtts >= -+ extra_acked_win_rtts_thresh) { - bbr->extra_acked_win_rtts = 0; - bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? - 0 : 1; -@@ -862,49 +848,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, - bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; - } - --/* Estimate when the pipe is full, using the change in delivery rate: BBR -- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by -- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited -- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the -- * higher rwin, 3: we get higher delivery rate samples. Or transient -- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar -- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. -- */ --static void bbr_check_full_bw_reached(struct sock *sk, -- const struct rate_sample *rs) --{ -- struct bbr *bbr = inet_csk_ca(sk); -- u32 bw_thresh; -- -- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) -- return; -- -- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; -- if (bbr_max_bw(sk) >= bw_thresh) { -- bbr->full_bw = bbr_max_bw(sk); -- bbr->full_bw_cnt = 0; -- return; -- } -- ++bbr->full_bw_cnt; -- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; --} -- --/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ --static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) --{ -- struct bbr *bbr = inet_csk_ca(sk); -- -- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { -- bbr->mode = BBR_DRAIN; /* drain queue we created */ -- tcp_sk(sk)->snd_ssthresh = -- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); -- } /* fall through to check if in-flight is already small: */ -- if (bbr->mode == BBR_DRAIN && -- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= -- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) -- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ --} -- - static void bbr_check_probe_rtt_done(struct sock *sk) - { - struct tcp_sock *tp = tcp_sk(sk); -@@ -914,9 +857,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) - after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) - return; - -- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ -+ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ - tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); -- bbr_reset_mode(sk); -+ bbr_exit_probe_rtt(sk); - } - - /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and -@@ -942,23 +885,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) - { - struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); -- bool filter_expired; -+ bool probe_rtt_expired, min_rtt_expired; -+ u32 expire; - -- /* Track min RTT seen in the min_rtt_win_sec filter window: */ -- filter_expired = after(tcp_jiffies32, -- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); -+ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ -+ expire = bbr->probe_rtt_min_stamp + -+ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); -+ probe_rtt_expired = after(tcp_jiffies32, expire); - if (rs->rtt_us >= 0 && -- (rs->rtt_us < bbr->min_rtt_us || -- (filter_expired && !rs->is_ack_delayed))) { -- bbr->min_rtt_us = rs->rtt_us; -- bbr->min_rtt_stamp = tcp_jiffies32; -+ (rs->rtt_us < bbr->probe_rtt_min_us || -+ (probe_rtt_expired && !rs->is_ack_delayed))) { -+ bbr->probe_rtt_min_us = rs->rtt_us; -+ bbr->probe_rtt_min_stamp = tcp_jiffies32; -+ } -+ /* Track min RTT seen in the min_rtt_win_sec filter window: */ -+ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; -+ min_rtt_expired = after(tcp_jiffies32, expire); -+ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || -+ min_rtt_expired) { -+ bbr->min_rtt_us = bbr->probe_rtt_min_us; -+ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; - } - -- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && -+ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && - !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { - bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ - bbr_save_cwnd(sk); /* note cwnd so we can restore it */ - bbr->probe_rtt_done_stamp = 0; -+ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; -+ bbr->next_rtt_delivered = tp->delivered; - } - - if (bbr->mode == BBR_PROBE_RTT) { -@@ -967,9 +922,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) - (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; - /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ - if (!bbr->probe_rtt_done_stamp && -- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { -+ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { - bbr->probe_rtt_done_stamp = tcp_jiffies32 + -- msecs_to_jiffies(bbr_probe_rtt_mode_ms); -+ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); - bbr->probe_rtt_round_done = 0; - bbr->next_rtt_delivered = tp->delivered; - } else if (bbr->probe_rtt_done_stamp) { -@@ -990,18 +945,20 @@ static void bbr_update_gains(struct sock *sk) - - switch (bbr->mode) { - case BBR_STARTUP: -- bbr->pacing_gain = bbr_high_gain; -- bbr->cwnd_gain = bbr_high_gain; -+ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); -+ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); - break; - case BBR_DRAIN: -- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ -- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ -+ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ -+ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ - break; - case BBR_PROBE_BW: -- bbr->pacing_gain = (bbr->lt_use_bw ? -- BBR_UNIT : -- bbr_pacing_gain[bbr->cycle_idx]); -- bbr->cwnd_gain = bbr_cwnd_gain; -+ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; -+ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); -+ if (bbr_param(sk, bw_probe_cwnd_gain) && -+ bbr->cycle_idx == BBR_BW_PROBE_UP) -+ bbr->cwnd_gain += -+ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; - break; - case BBR_PROBE_RTT: - bbr->pacing_gain = BBR_UNIT; -@@ -1013,144 +970,1387 @@ static void bbr_update_gains(struct sock *sk) - } - } - --static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) -+__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) - { -- bbr_update_bw(sk, rs); -- bbr_update_ack_aggregation(sk, rs); -- bbr_update_cycle_phase(sk, rs); -- bbr_check_full_bw_reached(sk, rs); -- bbr_check_drain(sk, rs); -- bbr_update_min_rtt(sk, rs); -- bbr_update_gains(sk); -+ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ -+ return 3; - } - --__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) -+/* Incorporate a new bw sample into the current window of our max filter. */ -+static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) - { - struct bbr *bbr = inet_csk_ca(sk); -- u32 bw; -- -- bbr_update_model(sk, rs); - -- bw = bbr_bw(sk); -- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); -- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); -+ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); - } - --__bpf_kfunc static void bbr_init(struct sock *sk) -+/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ -+static void bbr_advance_max_bw_filter(struct sock *sk) - { -- struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); - -- bbr->prior_cwnd = 0; -- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; -- bbr->rtt_cnt = 0; -- bbr->next_rtt_delivered = tp->delivered; -- bbr->prev_ca_state = TCP_CA_Open; -- bbr->packet_conservation = 0; -- -- bbr->probe_rtt_done_stamp = 0; -- bbr->probe_rtt_round_done = 0; -- bbr->min_rtt_us = tcp_min_rtt(tp); -- bbr->min_rtt_stamp = tcp_jiffies32; -- -- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ -+ if (!bbr->bw_hi[1]) -+ return; /* no samples in this window; remember old window */ -+ bbr->bw_hi[0] = bbr->bw_hi[1]; -+ bbr->bw_hi[1] = 0; -+} - -- bbr->has_seen_rtt = 0; -- bbr_init_pacing_rate_from_rtt(sk); -+/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ -+static void bbr_reset_full_bw(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); - -- bbr->round_start = 0; -- bbr->idle_restart = 0; -- bbr->full_bw_reached = 0; - bbr->full_bw = 0; - bbr->full_bw_cnt = 0; -- bbr->cycle_mstamp = 0; -- bbr->cycle_idx = 0; -- bbr_reset_lt_bw_sampling(sk); -- bbr_reset_startup_mode(sk); -+ bbr->full_bw_now = 0; -+} - -- bbr->ack_epoch_mstamp = tp->tcp_mstamp; -- bbr->ack_epoch_acked = 0; -- bbr->extra_acked_win_rtts = 0; -- bbr->extra_acked_win_idx = 0; -- bbr->extra_acked[0] = 0; -- bbr->extra_acked[1] = 0; -+/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ -+static u32 bbr_target_inflight(struct sock *sk) -+{ -+ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); - -- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); -+ return min(bdp, tcp_sk(sk)->snd_cwnd); - } - --__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) -+static bool bbr_is_probing_bandwidth(struct sock *sk) - { -- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ -- return 3; -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return (bbr->mode == BBR_STARTUP) || -+ (bbr->mode == BBR_PROBE_BW && -+ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || -+ bbr->cycle_idx == BBR_BW_PROBE_UP)); -+} -+ -+/* Has the given amount of time elapsed since we marked the phase start? */ -+static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) -+{ -+ const struct tcp_sock *tp = tcp_sk(sk); -+ const struct bbr *bbr = inet_csk_ca(sk); -+ -+ return tcp_stamp_us_delta(tp->tcp_mstamp, -+ bbr->cycle_mstamp + interval_us) > 0; -+} -+ -+static void bbr_handle_queue_too_high_in_startup(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 bdp; /* estimated BDP in packets, with quantization budget */ -+ -+ bbr->full_bw_reached = 1; -+ -+ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); -+ bbr->inflight_hi = max(bdp, bbr->inflight_latest); -+} -+ -+/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ -+static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || -+ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) -+ return; -+ -+ if (ce_ratio >= bbr_param(sk, ecn_thresh)) -+ bbr->startup_ecn_rounds++; -+ else -+ bbr->startup_ecn_rounds = 0; -+ -+ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { -+ bbr_handle_queue_too_high_in_startup(sk); -+ return; -+ } -+} -+ -+/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ -+static int bbr_update_ecn_alpha(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct net *net = sock_net(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ s32 delivered, delivered_ce; -+ u64 alpha, ce_ratio; -+ u32 gain; -+ bool want_ecn_alpha; -+ -+ /* See if we should use ECN sender logic for this connection. */ -+ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && -+ bbr_param(sk, ecn_factor) && -+ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || -+ !bbr_ecn_max_rtt_us)) -+ bbr->ecn_eligible = 1; -+ -+ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ -+ want_ecn_alpha = (bbr->ecn_eligible || -+ (bbr_can_use_ecn(sk) && -+ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); -+ if (!want_ecn_alpha) -+ return -1; -+ -+ delivered = tp->delivered - bbr->alpha_last_delivered; -+ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; -+ -+ if (delivered == 0 || /* avoid divide by zero */ -+ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ -+ return -1; -+ -+ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); -+ ce_ratio = (u64)delivered_ce << BBR_SCALE; -+ do_div(ce_ratio, delivered); -+ -+ gain = bbr_param(sk, ecn_alpha_gain); -+ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; -+ alpha += (gain * ce_ratio) >> BBR_SCALE; -+ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); -+ -+ bbr->alpha_last_delivered = tp->delivered; -+ bbr->alpha_last_delivered_ce = tp->delivered_ce; -+ -+ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); -+ return (int)ce_ratio; - } - --/* In theory BBR does not need to undo the cwnd since it does not -- * always reduce cwnd on losses (see bbr_main()). Keep it for now. -+/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 -+ * flow label) if it encounters sustained congestion in the form of ECN marks. - */ --__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) -+static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->round_start && ce_ratio >= 0) -+ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); -+ -+ tcp_plb_check_rehash(sk, &bbr->plb); -+} -+ -+/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ -+static void bbr_raise_inflight_hi_slope(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 growth_this_round, cnt; -+ -+ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ -+ growth_this_round = 1 << bbr->bw_probe_up_rounds; -+ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); -+ cnt = tcp_snd_cwnd(tp) / growth_this_round; -+ cnt = max(cnt, 1U); -+ bbr->bw_probe_up_cnt = cnt; -+} -+ -+/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ -+static void bbr_probe_inflight_hi_upward(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 delta; -+ -+ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) -+ return; /* not fully using inflight_hi, so don't grow it */ -+ -+ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ -+ bbr->bw_probe_up_acks += rs->acked_sacked; -+ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { -+ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; -+ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; -+ bbr->inflight_hi += delta; -+ bbr->try_fast_path = 0; /* Need to update cwnd */ -+ } -+ -+ if (bbr->round_start) -+ bbr_raise_inflight_hi_slope(sk); -+} -+ -+/* Does loss/ECN rate for this sample say inflight is "too high"? -+ * This is used by both the bbr_check_loss_too_high_in_startup() function, -+ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which -+ * uses it to notice when loss/ECN rates suggest inflight is too high. 
-+ */ -+static bool bbr_is_inflight_too_high(const struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ const struct bbr *bbr = inet_csk_ca(sk); -+ u32 loss_thresh, ecn_thresh; -+ -+ if (rs->lost > 0 && rs->tx_in_flight) { -+ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> -+ BBR_SCALE; -+ if (rs->lost > loss_thresh) { -+ return true; -+ } -+ } -+ -+ if (rs->delivered_ce > 0 && rs->delivered > 0 && -+ bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { -+ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> -+ BBR_SCALE; -+ if (rs->delivered_ce > ecn_thresh) { -+ return true; -+ } -+ } -+ -+ return false; -+} -+ -+/* Calculate the tx_in_flight level that corresponded to excessive loss. -+ * We find "lost_prefix" segs of the skb where loss rate went too high, -+ * by solving for "lost_prefix" in the following equation: -+ * lost / inflight >= loss_thresh -+ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh -+ * Then we take that equation, convert it to fixed point, and -+ * round up to the nearest packet. -+ */ -+static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, -+ const struct rate_sample *rs, -+ const struct sk_buff *skb) -+{ -+ const struct tcp_sock *tp = tcp_sk(sk); -+ u32 loss_thresh = bbr_param(sk, loss_thresh); -+ u32 pcount, divisor, inflight_hi; -+ s32 inflight_prev, lost_prev; -+ u64 loss_budget, lost_prefix; -+ -+ pcount = tcp_skb_pcount(skb); -+ -+ /* How much data was in flight before this skb? */ -+ inflight_prev = rs->tx_in_flight - pcount; -+ if (inflight_prev < 0) { -+ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( -+ pcount, -+ TCP_SKB_CB(skb)->sacked, -+ rs->tx_in_flight), -+ "tx_in_flight: %u pcount: %u reneg: %u", -+ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); -+ return ~0U; -+ } -+ -+ /* How much inflight data was marked lost before this skb? */ -+ lost_prev = rs->lost - pcount; -+ if (WARN_ONCE(lost_prev < 0, -+ "cwnd: %u ca: %d out: %u lost: %u pif: %u " -+ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " -+ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", -+ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, -+ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), -+ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, -+ rs->lost, lost_prev, pcount, -+ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, -+ tp->is_sack_reneg)) -+ return ~0U; -+ -+ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ -+ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; -+ loss_budget >>= BBR_SCALE; -+ if (lost_prev >= loss_budget) { -+ lost_prefix = 0; /* previous losses crossed loss_thresh */ -+ } else { -+ lost_prefix = loss_budget - lost_prev; -+ lost_prefix <<= BBR_SCALE; -+ divisor = BBR_UNIT - loss_thresh; -+ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ -+ return ~0U; -+ do_div(lost_prefix, divisor); -+ } -+ -+ inflight_hi = inflight_prev + lost_prefix; -+ return inflight_hi; -+} -+ -+/* If loss/ECN rates during probing indicated we may have overfilled a -+ * buffer, return an operating point that tries to leave unutilized headroom in -+ * the path for other flows, for fairness convergence and lower RTTs and loss. 
-+ */ -+static u32 bbr_inflight_with_headroom(const struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 headroom, headroom_fraction; -+ -+ if (bbr->inflight_hi == ~0U) -+ return ~0U; -+ -+ headroom_fraction = bbr_param(sk, inflight_headroom); -+ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; -+ headroom = max(headroom, 1U); -+ return max_t(s32, bbr->inflight_hi - headroom, -+ bbr_param(sk, cwnd_min_target)); -+} -+ -+/* Bound cwnd to a sensible level, based on our current probing state -+ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). -+ */ -+static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 cap; -+ -+ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() -+ * and thus cong_control() without first initializing us(!). -+ */ -+ if (!bbr->initialized) -+ return; -+ -+ cap = ~0U; -+ if (bbr->mode == BBR_PROBE_BW && -+ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { -+ /* Probe to see if more packets fit in the path. */ -+ cap = bbr->inflight_hi; -+ } else { -+ if (bbr->mode == BBR_PROBE_RTT || -+ (bbr->mode == BBR_PROBE_BW && -+ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) -+ cap = bbr_inflight_with_headroom(sk); -+ } -+ /* Adapt to any loss/ECN since our last bw probe. */ -+ cap = min(cap, bbr->inflight_lo); -+ -+ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); -+ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); -+} -+ -+/* How should we multiplicatively cut bw or inflight limits based on ECN? */ -+static u32 bbr_ecn_cut(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ return BBR_UNIT - -+ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); -+} -+ -+/* Init lower bounds if have not inited yet. */ -+static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (init_bw && bbr->bw_lo == ~0U) -+ bbr->bw_lo = bbr_max_bw(sk); -+ if (bbr->inflight_lo == ~0U) -+ bbr->inflight_lo = tcp_snd_cwnd(tp); -+} -+ -+/* Reduce bw and inflight to (1 - beta). */ -+static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) -+{ -+ struct bbr* bbr = inet_csk_ca(sk); -+ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); -+ -+ *bw = max_t(u32, bbr->bw_latest, -+ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); -+ *inflight = max_t(u32, bbr->inflight_latest, -+ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); -+} -+ -+/* Reduce inflight to (1 - alpha*ecn_factor). */ -+static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 ecn_cut = bbr_ecn_cut(sk); -+ -+ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; -+} -+ -+/* Estimate a short-term lower bound on the capacity available now, based -+ * on measurements of the current delivery process and recent history. When we -+ * are seeing loss/ECN at times when we are not probing bw, then conservatively -+ * move toward flow balance by multiplicatively cutting our short-term -+ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a -+ * multiplicative decrease in order to converge to a lower capacity in time -+ * logarithmic in the magnitude of the decrease. -+ * -+ * However, we do not cut our short-term estimates lower than the current rate -+ * and volume of delivered data from this round trip, since from the current -+ * delivery process we can estimate the measured capacity available now. 
-+ * -+ * Anything faster than that approach would knowingly risk high loss, which can -+ * cause low bw for Reno/CUBIC and high loss recovery latency for -+ * request/response flows using any congestion control. -+ */ -+static void bbr_adapt_lower_bounds(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 ecn_inflight_lo = ~0U; -+ -+ /* We only use lower-bound estimates when not probing bw. -+ * When probing we need to push inflight higher to probe bw. -+ */ -+ if (bbr_is_probing_bandwidth(sk)) -+ return; -+ -+ /* ECN response. */ -+ if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { -+ bbr_init_lower_bounds(sk, false); -+ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); -+ } -+ -+ /* Loss response. */ -+ if (bbr->loss_in_round) { -+ bbr_init_lower_bounds(sk, true); -+ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); -+ } -+ -+ /* Adjust to the lower of the levels implied by loss/ECN. */ -+ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); -+ bbr->bw_lo = max(1U, bbr->bw_lo); -+} -+ -+/* Reset any short-term lower-bound adaptation to congestion, so that we can -+ * push our inflight up. -+ */ -+static void bbr_reset_lower_bounds(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->bw_lo = ~0U; -+ bbr->inflight_lo = ~0U; -+} -+ -+/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state -+ * machine phase where we adapt our lower bound based on congestion signals. -+ */ -+static void bbr_reset_congestion_signals(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->loss_in_round = 0; -+ bbr->ecn_in_round = 0; -+ bbr->loss_in_cycle = 0; -+ bbr->ecn_in_cycle = 0; -+ bbr->bw_latest = 0; -+ bbr->inflight_latest = 0; -+} -+ -+static void bbr_exit_loss_recovery(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); -+ bbr->try_fast_path = 0; /* bound cwnd using latest model */ -+} -+ -+/* Update rate and volume of delivered data from latest round trip. */ -+static void bbr_update_latest_delivery_signals( -+ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->loss_round_start = 0; -+ if (rs->interval_us <= 0 || !rs->acked_sacked) -+ return; /* Not a valid observation */ -+ -+ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); -+ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); -+ -+ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { -+ bbr->loss_round_delivered = tp->delivered; -+ bbr->loss_round_start = 1; /* mark start of new round trip */ -+ } -+} -+ -+/* Once per round, reset filter for latest rate and volume of delivered data. */ -+static void bbr_advance_latest_delivery_signals( -+ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ /* If ACK matches a TLP retransmit, persist the filter. If we detect -+ * that a TLP retransmit plugged a tail loss, we'll want to remember -+ * how much data the path delivered before the tail loss. -+ */ -+ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { -+ bbr->bw_latest = ctx->sample_bw; -+ bbr->inflight_latest = rs->delivered; -+ } -+} -+ -+/* Update (most of) our congestion signals: track the recent rate and volume of -+ * delivered data, presence of loss, and EWMA degree of ECN marking. 
-+ */ -+static void bbr_update_congestion_signals( -+ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) - { - struct bbr *bbr = inet_csk_ca(sk); -+ u64 bw; -+ -+ if (rs->interval_us <= 0 || !rs->acked_sacked) -+ return; /* Not a valid observation */ -+ bw = ctx->sample_bw; - -- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ -+ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) -+ bbr_take_max_bw_sample(sk, bw); -+ -+ bbr->loss_in_round |= (rs->losses > 0); -+ -+ if (!bbr->loss_round_start) -+ return; /* skip the per-round-trip updates */ -+ /* Now do per-round-trip updates. */ -+ bbr_adapt_lower_bounds(sk, rs); -+ -+ bbr->loss_in_round = 0; -+ bbr->ecn_in_round = 0; -+} -+ -+/* Bandwidth probing can cause loss. To help coexistence with loss-based -+ * congestion control we spread out our probing in a Reno-conscious way. Due to -+ * the shape of the Reno sawtooth, the time required between loss epochs for an -+ * idealized Reno flow is a number of round trips that is the BDP of that -+ * flow. We count packet-timed round trips directly, since measured RTT can -+ * vary widely, and Reno is driven by packet-timed round trips. -+ */ -+static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 rounds; -+ -+ /* Random loss can shave some small percentage off of our inflight -+ * in each round. To survive this, flows need robust periodic probes. -+ */ -+ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); -+ return bbr->rounds_since_probe >= rounds; -+} -+ -+/* How long do we want to wait before probing for bandwidth (and risking -+ * loss)? We randomize the wait, for better mixing and fairness convergence. -+ * -+ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. -+ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, -+ * (eg 4K video to a broadband user): -+ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets -+ * -+ * We bound the BBR-native inter-bw-probe wall clock time to be: -+ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time -+ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must -+ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs -+ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable -+ * amount of time to discover unutilized bw on human-scale interactive -+ * time-scales (e.g. perhaps traffic from a web page download that we -+ * were competing with is now complete). -+ */ -+static void bbr_pick_probe_wait(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ /* Decide the random round-trip bound for wait until probe: */ -+ bbr->rounds_since_probe = -+ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); -+ /* Decide the random wall clock bound for wait until probe: */ -+ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + -+ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); -+} -+ -+static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->cycle_idx = cycle_idx; -+ /* New phase, so need to update cwnd and pacing rate. */ -+ bbr->try_fast_path = 0; -+} -+ -+/* Send at estimated bw to fill the pipe, but not queue. We need this phase -+ * before PROBE_UP, because as soon as we send faster than the available bw -+ * we will start building a queue, and if the buffer is shallow we can cause -+ * loss. 
If we do not fill the pipe before we cause this loss, our bw_hi and -+ * inflight_hi estimates will underestimate. -+ */ -+static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr_reset_lower_bounds(sk); -+ bbr->bw_probe_up_rounds = bw_probe_up_rounds; -+ bbr->bw_probe_up_acks = 0; -+ bbr->stopped_risky_probe = 0; -+ bbr->ack_phase = BBR_ACKS_REFILLING; -+ bbr->next_rtt_delivered = tp->delivered; -+ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); -+} -+ -+/* Now probe max deliverable data rate and volume. */ -+static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; -+ bbr->next_rtt_delivered = tp->delivered; -+ bbr->cycle_mstamp = tp->tcp_mstamp; -+ bbr_reset_full_bw(sk); -+ bbr->full_bw = ctx->sample_bw; -+ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); -+ bbr_raise_inflight_hi_slope(sk); -+} -+ -+/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall -+ * clock time at which to probe beyond an inflight that we think to be -+ * safe. This will knowingly risk packet loss, so we want to do this rarely, to -+ * keep packet loss rates low. Also start a round-trip counter, to probe faster -+ * if we estimate a Reno flow at our BDP would probe faster. -+ */ -+static void bbr_start_bw_probe_down(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr_reset_congestion_signals(sk); -+ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ -+ bbr_pick_probe_wait(sk); -+ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ -+ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; -+ bbr->next_rtt_delivered = tp->delivered; -+ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); -+} -+ -+/* Cruise: maintain what we estimate to be a neutral, conservative -+ * operating point, without attempting to probe up for bandwidth or down for -+ * RTT, and only reducing inflight in response to loss/ECN signals. -+ */ -+static void bbr_start_bw_probe_cruise(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->inflight_lo != ~0U) -+ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); -+ -+ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); -+} -+ -+/* Loss and/or ECN rate is too high while probing. -+ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. -+ */ -+static void bbr_handle_inflight_too_high(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ const u32 beta = bbr_param(sk, beta); -+ -+ bbr->prev_probe_too_high = 1; -+ bbr->bw_probe_samples = 0; /* only react once per probe */ -+ /* If we are app-limited then we are not robustly -+ * probing the max volume of inflight data we think -+ * might be safe (analogous to how app-limited bw -+ * samples are not known to be robustly probing bw). -+ */ -+ if (!rs->is_app_limited) { -+ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, -+ (u64)bbr_target_inflight(sk) * -+ (BBR_UNIT - beta) >> BBR_SCALE); -+ } -+ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) -+ bbr_start_bw_probe_down(sk); -+} -+ -+/* If we're seeing bw and loss samples reflecting our bw probing, adapt -+ * using the signals we see. If loss or ECN mark rate gets too high, then adapt -+ * inflight_hi downward. 
If we're able to push inflight higher without such -+ * signals, push higher: adapt inflight_hi upward. -+ */ -+static bool bbr_adapt_upper_bounds(struct sock *sk, -+ const struct rate_sample *rs, -+ struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ /* Track when we'll see bw/loss samples resulting from our bw probes. */ -+ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) -+ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; -+ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { -+ /* End of samples from bw probing phase. */ -+ bbr->bw_probe_samples = 0; -+ bbr->ack_phase = BBR_ACKS_INIT; -+ /* At this point in the cycle, our current bw sample is also -+ * our best recent chance at finding the highest available bw -+ * for this flow. So now is the best time to forget the bw -+ * samples from the previous cycle, by advancing the window. -+ */ -+ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) -+ bbr_advance_max_bw_filter(sk); -+ /* If we had an inflight_hi, then probed and pushed inflight all -+ * the way up to hit that inflight_hi without seeing any -+ * high loss/ECN in all the resulting ACKs from that probing, -+ * then probe up again, this time letting inflight persist at -+ * inflight_hi for a round trip, then accelerating beyond. -+ */ -+ if (bbr->mode == BBR_PROBE_BW && -+ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { -+ bbr_start_bw_probe_refill(sk, 0); -+ return true; /* yes, decided state transition */ -+ } -+ } -+ if (bbr_is_inflight_too_high(sk, rs)) { -+ if (bbr->bw_probe_samples) /* sample is from bw probing? */ -+ bbr_handle_inflight_too_high(sk, rs); -+ } else { -+ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ -+ -+ if (bbr->inflight_hi == ~0U) -+ return false; /* no excess queue signals yet */ -+ -+ /* To be resilient to random loss, we must raise bw/inflight_hi -+ * if we observe in any phase that a higher level is safe. -+ */ -+ if (rs->tx_in_flight > bbr->inflight_hi) { -+ bbr->inflight_hi = rs->tx_in_flight; -+ } -+ -+ if (bbr->mode == BBR_PROBE_BW && -+ bbr->cycle_idx == BBR_BW_PROBE_UP) -+ bbr_probe_inflight_hi_upward(sk, rs); -+ } -+ -+ return false; -+} -+ -+/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ -+static bool bbr_check_time_to_probe_bw(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 n; -+ -+ /* If we seem to be at an operating point where we are not seeing loss -+ * but we are seeing ECN marks, then when the ECN marks cease we reprobe -+ * quickly (in case cross-traffic has ceased and freed up bw). -+ */ -+ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && -+ bbr->ecn_in_cycle && !bbr->loss_in_cycle && -+ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { -+ /* Calculate n so that when bbr_raise_inflight_hi_slope() -+ * computes growth_this_round as 2^n it will be roughly the -+ * desired volume of data (inflight_hi*ecn_reprobe_gain). -+ */ -+ n = ilog2((((u64)bbr->inflight_hi * -+ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); -+ bbr_start_bw_probe_refill(sk, n); -+ return true; -+ } -+ -+ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || -+ bbr_is_reno_coexistence_probe_time(sk)) { -+ bbr_start_bw_probe_refill(sk, 0); -+ return true; -+ } -+ return false; -+} -+ -+/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ -+static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) -+{ -+ /* Always need to pull inflight down to leave headroom in queue. 
*/ -+ if (inflight > bbr_inflight_with_headroom(sk)) -+ return false; -+ -+ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); -+} -+ -+/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ -+static void bbr_update_cycle_phase(struct sock *sk, -+ const struct rate_sample *rs, -+ struct bbr_context *ctx) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ bool is_bw_probe_done = false; -+ u32 inflight, bw; -+ -+ if (!bbr_full_bw_reached(sk)) -+ return; -+ -+ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ -+ if (bbr_adapt_upper_bounds(sk, rs, ctx)) -+ return; /* already decided state transition */ -+ -+ if (bbr->mode != BBR_PROBE_BW) -+ return; -+ -+ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); -+ bw = bbr_max_bw(sk); -+ -+ switch (bbr->cycle_idx) { -+ /* First we spend most of our time cruising with a pacing_gain of 1.0, -+ * which paces at the estimated bw, to try to fully use the pipe -+ * without building queue. If we encounter loss/ECN marks, we adapt -+ * by slowing down. -+ */ -+ case BBR_BW_PROBE_CRUISE: -+ if (bbr_check_time_to_probe_bw(sk, rs)) -+ return; /* already decided state transition */ -+ break; -+ -+ /* After cruising, when it's time to probe, we first "refill": we send -+ * at the estimated bw to fill the pipe, before probing higher and -+ * knowingly risking overflowing the bottleneck buffer (causing loss). -+ */ -+ case BBR_BW_PROBE_REFILL: -+ if (bbr->round_start) { -+ /* After one full round trip of sending in REFILL, we -+ * start to see bw samples reflecting our REFILL, which -+ * may be putting too much data in flight. -+ */ -+ bbr->bw_probe_samples = 1; -+ bbr_start_bw_probe_up(sk, ctx); -+ } -+ break; -+ -+ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to -+ * probe for bw. If we have not seen loss/ECN, we try to raise inflight -+ * to at least pacing_gain*BDP; note that this may take more than -+ * min_rtt if min_rtt is small (e.g. on a LAN). -+ * -+ * We terminate PROBE_UP bandwidth probing upon any of the following: -+ * -+ * (1) We've pushed inflight up to hit the inflight_hi target set in the -+ * most recent previous bw probe phase. Thus we want to start -+ * draining the queue immediately because it's very likely the most -+ * recently sent packets will fill the queue and cause drops. -+ * (2) If inflight_hi has not limited bandwidth growth recently, and -+ * yet delivered bandwidth has not increased much recently -+ * (bbr->full_bw_now). -+ * (3) Loss filter says loss rate is "too high". -+ * (4) ECN filter says ECN mark rate is "too high". -+ * -+ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() -+ */ -+ case BBR_BW_PROBE_UP: -+ if (bbr->prev_probe_too_high && -+ inflight >= bbr->inflight_hi) { -+ bbr->stopped_risky_probe = 1; -+ is_bw_probe_done = true; -+ } else { -+ if (tp->is_cwnd_limited && -+ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { -+ /* inflight_hi is limiting bw growth */ -+ bbr_reset_full_bw(sk); -+ bbr->full_bw = ctx->sample_bw; -+ } else if (bbr->full_bw_now) { -+ /* Plateau in estimated bw. Pipe looks full. */ -+ is_bw_probe_done = true; -+ } -+ } -+ if (is_bw_probe_done) { -+ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ -+ bbr_start_bw_probe_down(sk); /* restart w/ down */ -+ } -+ break; -+ -+ /* After probing in PROBE_UP, we have usually accumulated some data in -+ * the bottleneck buffer (if bw probing didn't find more bw). We next -+ * enter PROBE_DOWN to try to drain any excess data from the queue. 
To -+ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until -+ * our inflight is less then that target cruising point, which is the -+ * minimum of (a) the amount needed to leave headroom, and (b) the -+ * estimated BDP. Once inflight falls to match the target, we estimate -+ * the queue is drained; persisting would underutilize the pipe. -+ */ -+ case BBR_BW_PROBE_DOWN: -+ if (bbr_check_time_to_probe_bw(sk, rs)) -+ return; /* already decided state transition */ -+ if (bbr_check_time_to_cruise(sk, inflight, bw)) -+ bbr_start_bw_probe_cruise(sk); -+ break; -+ -+ default: -+ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); -+ } -+} -+ -+/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ -+static void bbr_exit_probe_rtt(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr_reset_lower_bounds(sk); -+ if (bbr_full_bw_reached(sk)) { -+ bbr->mode = BBR_PROBE_BW; -+ /* Raising inflight after PROBE_RTT may cause loss, so reset -+ * the PROBE_BW clock and schedule the next bandwidth probe for -+ * a friendly and randomized future point in time. -+ */ -+ bbr_start_bw_probe_down(sk); -+ /* Since we are exiting PROBE_RTT, we know inflight is -+ * below our estimated BDP, so it is reasonable to cruise. -+ */ -+ bbr_start_bw_probe_cruise(sk); -+ } else { -+ bbr->mode = BBR_STARTUP; -+ } -+} -+ -+/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until -+ * the end of the round in recovery to get a good estimate of how many packets -+ * have been lost, and how many we need to drain with a low pacing rate. -+ */ -+static void bbr_check_loss_too_high_in_startup(struct sock *sk, -+ const struct rate_sample *rs) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr_full_bw_reached(sk)) -+ return; -+ -+ /* For STARTUP exit, check the loss rate at the end of each round trip -+ * of Recovery episodes in STARTUP. We check the loss rate at the end -+ * of the round trip to filter out noisy/low loss and have a better -+ * sense of inflight (extent of loss), so we can drain more accurately. -+ */ -+ if (rs->losses && bbr->loss_events_in_round < 0xf) -+ bbr->loss_events_in_round++; /* update saturating counter */ -+ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && -+ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && -+ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && -+ bbr_is_inflight_too_high(sk, rs)) { -+ bbr_handle_queue_too_high_in_startup(sk); -+ return; -+ } -+ if (bbr->loss_round_start) -+ bbr->loss_events_in_round = 0; -+} -+ -+/* Estimate when the pipe is full, using the change in delivery rate: BBR -+ * estimates bw probing filled the pipe if the estimated bw hasn't changed by -+ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited -+ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the -+ * higher rwin, 3: we get higher delivery rate samples. Or transient -+ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar -+ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
-+ */ -+static void bbr_check_full_bw_reached(struct sock *sk, -+ const struct rate_sample *rs, -+ struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 bw_thresh, full_cnt, thresh; -+ -+ if (bbr->full_bw_now || rs->is_app_limited) -+ return; -+ -+ thresh = bbr_param(sk, full_bw_thresh); -+ full_cnt = bbr_param(sk, full_bw_cnt); -+ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; -+ if (ctx->sample_bw >= bw_thresh) { -+ bbr_reset_full_bw(sk); -+ bbr->full_bw = ctx->sample_bw; -+ return; -+ } -+ if (!bbr->round_start) -+ return; -+ ++bbr->full_bw_cnt; -+ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; -+ bbr->full_bw_reached |= bbr->full_bw_now; -+} -+ -+/* If pipe is probably full, drain the queue and then enter steady-state. */ -+static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, -+ struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { -+ bbr->mode = BBR_DRAIN; /* drain queue we created */ -+ /* Set ssthresh to export purely for monitoring, to signal -+ * completion of initial STARTUP by setting to a non- -+ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). -+ */ -+ tcp_sk(sk)->snd_ssthresh = -+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); -+ bbr_reset_congestion_signals(sk); -+ } /* fall through to check if in-flight is already small: */ -+ if (bbr->mode == BBR_DRAIN && -+ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= -+ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { -+ bbr->mode = BBR_PROBE_BW; -+ bbr_start_bw_probe_down(sk); -+ } -+} -+ -+static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, -+ struct bbr_context *ctx) -+{ -+ bbr_update_congestion_signals(sk, rs, ctx); -+ bbr_update_ack_aggregation(sk, rs); -+ bbr_check_loss_too_high_in_startup(sk, rs); -+ bbr_check_full_bw_reached(sk, rs, ctx); -+ bbr_check_drain(sk, rs, ctx); -+ bbr_update_cycle_phase(sk, rs, ctx); -+ bbr_update_min_rtt(sk, rs); -+} -+ -+/* Fast path for app-limited case. -+ * -+ * On each ack, we execute bbr state machine, which primarily consists of: -+ * 1) update model based on new rate sample, and -+ * 2) update control based on updated model or state change. -+ * -+ * There are certain workload/scenarios, e.g. app-limited case, where -+ * either we can skip updating model or we can skip update of both model -+ * as well as control. This provides signifcant softirq cpu savings for -+ * processing incoming acks. -+ * -+ * In case of app-limited, if there is no congestion (loss/ecn) and -+ * if observed bw sample is less than current estimated bw, then we can -+ * skip some of the computation in bbr state processing: -+ * -+ * - if there is no rtt/mode/phase change: In this case, since all the -+ * parameters of the network model are constant, we can skip model -+ * as well control update. -+ * -+ * - else we can skip rest of the model update. But we still need to -+ * update the control to account for the new rtt/mode/phase. -+ * -+ * Returns whether we can take fast path or not. 
-+ */ -+static bool bbr_run_fast_path(struct sock *sk, bool *update_model, -+ const struct rate_sample *rs, struct bbr_context *ctx) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ u32 prev_min_rtt_us, prev_mode; -+ -+ if (bbr_param(sk, fast_path) && bbr->try_fast_path && -+ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && -+ !bbr->loss_in_round && !bbr->ecn_in_round ) { -+ prev_mode = bbr->mode; -+ prev_min_rtt_us = bbr->min_rtt_us; -+ bbr_check_drain(sk, rs, ctx); -+ bbr_update_cycle_phase(sk, rs, ctx); -+ bbr_update_min_rtt(sk, rs); -+ -+ if (bbr->mode == prev_mode && -+ bbr->min_rtt_us == prev_min_rtt_us && -+ bbr->try_fast_path) { -+ return true; -+ } -+ -+ /* Skip model update, but control still needs to be updated */ -+ *update_model = false; -+ } -+ return false; -+} -+ -+__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ struct bbr_context ctx = { 0 }; -+ bool update_model = true; -+ u32 bw, round_delivered; -+ int ce_ratio = -1; -+ -+ round_delivered = bbr_update_round_start(sk, rs, &ctx); -+ if (bbr->round_start) { -+ bbr->rounds_since_probe = -+ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); -+ ce_ratio = bbr_update_ecn_alpha(sk); -+ } -+ bbr_plb(sk, rs, ce_ratio); -+ -+ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); -+ bbr_calculate_bw_sample(sk, rs, &ctx); -+ bbr_update_latest_delivery_signals(sk, rs, &ctx); -+ -+ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) -+ goto out; -+ -+ if (update_model) -+ bbr_update_model(sk, rs, &ctx); -+ -+ bbr_update_gains(sk); -+ bw = bbr_bw(sk); -+ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); -+ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, -+ tcp_snd_cwnd(tp), &ctx); -+ bbr_bound_cwnd_for_inflight_model(sk); -+ -+out: -+ bbr_advance_latest_delivery_signals(sk, rs, &ctx); -+ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; -+ bbr->loss_in_cycle |= rs->lost > 0; -+ bbr->ecn_in_cycle |= rs->delivered_ce > 0; -+} -+ -+__bpf_kfunc static void bbr_init(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr->initialized = 1; -+ -+ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); -+ bbr->prior_cwnd = tp->prior_cwnd; -+ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; -+ bbr->next_rtt_delivered = tp->delivered; -+ bbr->prev_ca_state = TCP_CA_Open; -+ -+ bbr->probe_rtt_done_stamp = 0; -+ bbr->probe_rtt_round_done = 0; -+ bbr->probe_rtt_min_us = tcp_min_rtt(tp); -+ bbr->probe_rtt_min_stamp = tcp_jiffies32; -+ bbr->min_rtt_us = tcp_min_rtt(tp); -+ bbr->min_rtt_stamp = tcp_jiffies32; -+ -+ bbr->has_seen_rtt = 0; -+ bbr_init_pacing_rate_from_rtt(sk); -+ -+ bbr->round_start = 0; -+ bbr->idle_restart = 0; -+ bbr->full_bw_reached = 0; -+ bbr->full_bw = 0; - bbr->full_bw_cnt = 0; -- bbr_reset_lt_bw_sampling(sk); -- return tcp_snd_cwnd(tcp_sk(sk)); -+ bbr->cycle_mstamp = 0; -+ bbr->cycle_idx = 0; -+ -+ bbr_reset_startup_mode(sk); -+ -+ bbr->ack_epoch_mstamp = tp->tcp_mstamp; -+ bbr->ack_epoch_acked = 0; -+ bbr->extra_acked_win_rtts = 0; -+ bbr->extra_acked_win_idx = 0; -+ bbr->extra_acked[0] = 0; -+ bbr->extra_acked[1] = 0; -+ -+ bbr->ce_state = 0; -+ bbr->prior_rcv_nxt = tp->rcv_nxt; -+ bbr->try_fast_path = 0; -+ -+ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); -+ -+ /* Start sampling ECN mark rate after first full flight is ACKed: */ -+ bbr->loss_round_delivered = tp->delivered + 1; -+ bbr->loss_round_start = 0; -+ bbr->undo_bw_lo = 0; 
-+ bbr->undo_inflight_lo = 0; -+ bbr->undo_inflight_hi = 0; -+ bbr->loss_events_in_round = 0; -+ bbr->startup_ecn_rounds = 0; -+ bbr_reset_congestion_signals(sk); -+ bbr->bw_lo = ~0U; -+ bbr->bw_hi[0] = 0; -+ bbr->bw_hi[1] = 0; -+ bbr->inflight_lo = ~0U; -+ bbr->inflight_hi = ~0U; -+ bbr_reset_full_bw(sk); -+ bbr->bw_probe_up_cnt = ~0U; -+ bbr->bw_probe_up_acks = 0; -+ bbr->bw_probe_up_rounds = 0; -+ bbr->probe_wait_us = 0; -+ bbr->stopped_risky_probe = 0; -+ bbr->ack_phase = BBR_ACKS_INIT; -+ bbr->rounds_since_probe = 0; -+ bbr->bw_probe_samples = 0; -+ bbr->prev_probe_too_high = 0; -+ bbr->ecn_eligible = 0; -+ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); -+ bbr->alpha_last_delivered = 0; -+ bbr->alpha_last_delivered_ce = 0; -+ bbr->plb.pause_until = 0; -+ -+ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; -+ -+ if (bbr_can_use_ecn(sk)) -+ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; -+} -+ -+/* BBR marks the current round trip as a loss round. */ -+static void bbr_note_loss(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ /* Capture "current" data over the full round trip of loss, to -+ * have a better chance of observing the full capacity of the path. -+ */ -+ if (!bbr->loss_in_round) /* first loss in this round trip? */ -+ bbr->loss_round_delivered = tp->delivered; /* set round trip */ -+ bbr->loss_in_round = 1; -+ bbr->loss_in_cycle = 1; - } - --/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ -+/* Core TCP stack informs us that the given skb was just marked lost. */ -+__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, -+ const struct sk_buff *skb) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); -+ struct rate_sample rs = {}; -+ -+ bbr_note_loss(sk); -+ -+ if (!bbr->bw_probe_samples) -+ return; /* not an skb sent while probing for bandwidth */ -+ if (unlikely(!scb->tx.delivered_mstamp)) -+ return; /* skb was SACKed, reneged, marked lost; ignore it */ -+ /* We are probing for bandwidth. Construct a rate sample that -+ * estimates what happened in the flight leading up to this lost skb, -+ * then see if the loss rate went too high, and if so at which packet. -+ */ -+ rs.tx_in_flight = scb->tx.in_flight; -+ rs.lost = tp->lost - scb->tx.lost; -+ rs.is_app_limited = scb->tx.is_app_limited; -+ if (bbr_is_inflight_too_high(sk, &rs)) { -+ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); -+ bbr_handle_inflight_too_high(sk, &rs); -+ } -+} -+ -+static void bbr_run_loss_probe_recovery(struct sock *sk) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ struct bbr *bbr = inet_csk_ca(sk); -+ struct rate_sample rs = {0}; -+ -+ bbr_note_loss(sk); -+ -+ if (!bbr->bw_probe_samples) -+ return; /* not sent while probing for bandwidth */ -+ /* We are probing for bandwidth. Construct a rate sample that -+ * estimates what happened in the flight leading up to this -+ * loss, then see if the loss rate went too high. -+ */ -+ rs.lost = 1; /* TLP probe repaired loss of a single segment */ -+ rs.tx_in_flight = bbr->inflight_latest + rs.lost; -+ rs.is_app_limited = tp->tlp_orig_data_app_limited; -+ if (bbr_is_inflight_too_high(sk, &rs)) -+ bbr_handle_inflight_too_high(sk, &rs); -+} -+ -+/* Revert short-term model if current loss recovery event was spurious. 
*/ -+__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) -+{ -+ struct bbr *bbr = inet_csk_ca(sk); -+ -+ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ -+ bbr->loss_in_round = 0; -+ -+ /* Revert to cwnd and other state saved before loss episode. */ -+ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); -+ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); -+ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); -+ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ -+ return bbr->prior_cwnd; -+} -+ -+/* Entering loss recovery, so save state for when we undo recovery. */ - __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) - { -+ struct bbr *bbr = inet_csk_ca(sk); -+ - bbr_save_cwnd(sk); -+ /* For undo, save state that adapts based on loss signal. */ -+ bbr->undo_bw_lo = bbr->bw_lo; -+ bbr->undo_inflight_lo = bbr->inflight_lo; -+ bbr->undo_inflight_hi = bbr->inflight_hi; - return tcp_sk(sk)->snd_ssthresh; - } - -+static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) -+{ -+ switch (bbr->mode) { -+ case BBR_STARTUP: -+ return BBR_PHASE_STARTUP; -+ case BBR_DRAIN: -+ return BBR_PHASE_DRAIN; -+ case BBR_PROBE_BW: -+ break; -+ case BBR_PROBE_RTT: -+ return BBR_PHASE_PROBE_RTT; -+ default: -+ return BBR_PHASE_INVALID; -+ } -+ switch (bbr->cycle_idx) { -+ case BBR_BW_PROBE_UP: -+ return BBR_PHASE_PROBE_BW_UP; -+ case BBR_BW_PROBE_DOWN: -+ return BBR_PHASE_PROBE_BW_DOWN; -+ case BBR_BW_PROBE_CRUISE: -+ return BBR_PHASE_PROBE_BW_CRUISE; -+ case BBR_BW_PROBE_REFILL: -+ return BBR_PHASE_PROBE_BW_REFILL; -+ default: -+ return BBR_PHASE_INVALID; -+ } -+} -+ - static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, -- union tcp_cc_info *info) -+ union tcp_cc_info *info) - { - if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || - ext & (1 << (INET_DIAG_VEGASINFO - 1))) { -- struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); -- u64 bw = bbr_bw(sk); -- -- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; -- memset(&info->bbr, 0, sizeof(info->bbr)); -- info->bbr.bbr_bw_lo = (u32)bw; -- info->bbr.bbr_bw_hi = (u32)(bw >> 32); -- info->bbr.bbr_min_rtt = bbr->min_rtt_us; -- info->bbr.bbr_pacing_gain = bbr->pacing_gain; -- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; -+ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); -+ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); -+ u64 bw_lo = bbr->bw_lo == ~0U ? 
-+ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); -+ struct tcp_bbr_info *bbr_info = &info->bbr; -+ -+ memset(bbr_info, 0, sizeof(*bbr_info)); -+ bbr_info->bbr_bw_lo = (u32)bw; -+ bbr_info->bbr_bw_hi = (u32)(bw >> 32); -+ bbr_info->bbr_min_rtt = bbr->min_rtt_us; -+ bbr_info->bbr_pacing_gain = bbr->pacing_gain; -+ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; -+ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; -+ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); -+ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; -+ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); -+ bbr_info->bbr_mode = bbr->mode; -+ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); -+ bbr_info->bbr_version = (__u8)BBR_VERSION; -+ bbr_info->bbr_inflight_lo = bbr->inflight_lo; -+ bbr_info->bbr_inflight_hi = bbr->inflight_hi; -+ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); - *attr = INET_DIAG_BBRINFO; -- return sizeof(info->bbr); -+ return sizeof(*bbr_info); - } - return 0; - } - - __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) - { -+ struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); - - if (new_state == TCP_CA_Loss) { -- struct rate_sample rs = { .losses = 1 }; - - bbr->prev_ca_state = TCP_CA_Loss; -- bbr->full_bw = 0; -- bbr->round_start = 1; /* treat RTO like end of a round */ -- bbr_lt_bw_sampling(sk, &rs); -+ tcp_plb_update_state_upon_rto(sk, &bbr->plb); -+ /* The tcp_write_timeout() call to sk_rethink_txhash() likely -+ * repathed this flow, so re-learn the min network RTT on the -+ * new path: -+ */ -+ bbr_reset_full_bw(sk); -+ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { -+ /* bbr_adapt_lower_bounds() needs cwnd before -+ * we suffered an RTO, to update inflight_lo: -+ */ -+ bbr->inflight_lo = -+ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); -+ } -+ } else if (bbr->prev_ca_state == TCP_CA_Loss && -+ new_state != TCP_CA_Loss) { -+ bbr_exit_loss_recovery(sk); - } - } - -+ - static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { -- .flags = TCP_CONG_NON_RESTRICTED, -+ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, - .name = "bbr", - .owner = THIS_MODULE, - .init = bbr_init, - .cong_control = bbr_main, - .sndbuf_expand = bbr_sndbuf_expand, -+ .skb_marked_lost = bbr_skb_marked_lost, - .undo_cwnd = bbr_undo_cwnd, - .cwnd_event = bbr_cwnd_event, - .ssthresh = bbr_ssthresh, -- .min_tso_segs = bbr_min_tso_segs, -+ .tso_segs = bbr_tso_segs, - .get_info = bbr_get_info, - .set_state = bbr_set_state, - }; -@@ -1159,10 +2359,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) - BTF_ID_FLAGS(func, bbr_init) - BTF_ID_FLAGS(func, bbr_main) - BTF_ID_FLAGS(func, bbr_sndbuf_expand) -+BTF_ID_FLAGS(func, bbr_skb_marked_lost) - BTF_ID_FLAGS(func, bbr_undo_cwnd) - BTF_ID_FLAGS(func, bbr_cwnd_event) - BTF_ID_FLAGS(func, bbr_ssthresh) --BTF_ID_FLAGS(func, bbr_min_tso_segs) -+BTF_ID_FLAGS(func, bbr_tso_segs) - BTF_ID_FLAGS(func, bbr_set_state) - BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) - -@@ -1195,5 +2396,12 @@ MODULE_AUTHOR("Van Jacobson "); - MODULE_AUTHOR("Neal Cardwell "); - MODULE_AUTHOR("Yuchung Cheng "); - MODULE_AUTHOR("Soheil Hassas Yeganeh "); -+MODULE_AUTHOR("Priyaranjan Jha "); -+MODULE_AUTHOR("Yousuk Seung "); -+MODULE_AUTHOR("Kevin Yang "); -+MODULE_AUTHOR("Arjun Roy "); -+MODULE_AUTHOR("David Morley "); -+ - MODULE_LICENSE("Dual BSD/GPL"); - MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); -+MODULE_VERSION(__stringify(BBR_VERSION)); -diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c -index 0306d257fa64..28f581c0dab7 100644 ---- a/net/ipv4/tcp_cong.c -+++ 
b/net/ipv4/tcp_cong.c -@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) - struct inet_connection_sock *icsk = inet_csk(sk); - - tcp_sk(sk)->prior_ssthresh = 0; -+ tcp_sk(sk)->fast_ack_mode = 0; - if (icsk->icsk_ca_ops->init) - icsk->icsk_ca_ops->init(sk); - if (tcp_ca_needs_ecn(sk)) -diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index cc05ec1faac8..7e995b9ea034 100644 ---- a/net/ipv4/tcp_input.c -+++ b/net/ipv4/tcp_input.c -@@ -370,7 +370,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) - tcp_enter_quickack_mode(sk, 2); - break; - case INET_ECN_CE: -- if (tcp_ca_needs_ecn(sk)) -+ if (tcp_ca_wants_ce_events(sk)) - tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); - - if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { -@@ -381,7 +381,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) - tp->ecn_flags |= TCP_ECN_SEEN; - break; - default: -- if (tcp_ca_needs_ecn(sk)) -+ if (tcp_ca_wants_ce_events(sk)) - tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); - tp->ecn_flags |= TCP_ECN_SEEN; - break; -@@ -1120,7 +1120,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) - */ - static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) - { -+ struct sock *sk = (struct sock *)tp; -+ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; -+ - tp->lost += tcp_skb_pcount(skb); -+ if (ca_ops->skb_marked_lost) -+ ca_ops->skb_marked_lost(sk, skb); - } - - void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) -@@ -1501,6 +1506,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, - WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); - tcp_skb_pcount_add(skb, -pcount); - -+ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ -+ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, -+ "prev in_flight: %u skb in_flight: %u pcount: %u", -+ TCP_SKB_CB(prev)->tx.in_flight, -+ TCP_SKB_CB(skb)->tx.in_flight, -+ pcount)) -+ TCP_SKB_CB(skb)->tx.in_flight = 0; -+ else -+ TCP_SKB_CB(skb)->tx.in_flight -= pcount; -+ TCP_SKB_CB(prev)->tx.in_flight += pcount; -+ - /* When we're adding to gso_segs == 1, gso_size will be zero, - * in theory this shouldn't be necessary but as long as DSACK - * code can come after this skb later on it's better to keep -@@ -3799,7 +3815,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) - /* This routine deals with acks during a TLP episode and ends an episode by - * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack - */ --static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) -+static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, -+ struct rate_sample *rs) - { - struct tcp_sock *tp = tcp_sk(sk); - -@@ -3816,6 +3833,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) - /* ACK advances: there was a loss, so reduce cwnd. Reset - * tlp_high_seq in tcp_init_cwnd_reduction() - */ -+ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); - tcp_init_cwnd_reduction(sk); - tcp_set_ca_state(sk, TCP_CA_CWR); - tcp_end_cwnd_reduction(sk); -@@ -3826,6 +3844,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) - FLAG_NOT_DUP | FLAG_DATA_SACKED))) { - /* Pure dupack: original and TLP probe arrived; no loss */ - tp->tlp_high_seq = 0; -+ } else { -+ /* This ACK matches a TLP retransmit. We cannot yet tell if -+ * this ACK is for the original or the TLP retransmit. 
-+ */ -+ rs->is_acking_tlp_retrans_seq = 1; - } - } - -@@ -3934,6 +3957,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) - - prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; - rs.prior_in_flight = tcp_packets_in_flight(tp); -+ tcp_rate_check_app_limited(sk); - - /* ts_recent update must be made after we are sure that the packet - * is in window. -@@ -4008,7 +4032,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) - tcp_rack_update_reo_wnd(sk, &rs); - - if (tp->tlp_high_seq) -- tcp_process_tlp_ack(sk, ack, flag); -+ tcp_process_tlp_ack(sk, ack, flag, &rs); - - if (tcp_ack_is_dubious(sk, flag)) { - if (!(flag & (FLAG_SND_UNA_ADVANCED | -@@ -4032,6 +4056,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) - delivered = tcp_newly_delivered(sk, delivered, flag); - lost = tp->lost - lost; /* freshly marked lost */ - rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); -+ rs.is_ece = !!(flag & FLAG_ECE); - tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); - tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); - tcp_xmit_recovery(sk, rexmit); -@@ -4051,7 +4076,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) - tcp_ack_probe(sk); - - if (tp->tlp_high_seq) -- tcp_process_tlp_ack(sk, ack, flag); -+ tcp_process_tlp_ack(sk, ack, flag, &rs); - return 1; - - old_ack: -@@ -5725,13 +5750,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) - - /* More than one full frame received... */ - if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && -+ (tp->fast_ack_mode == 1 || - /* ... and right edge of window advances far enough. - * (tcp_recvmsg() will send ACK otherwise). - * If application uses SO_RCVLOWAT, we want send ack now if - * we have not received enough bytes to satisfy the condition. - */ -- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || -- __tcp_select_window(sk) >= tp->rcv_wnd)) || -+ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || -+ __tcp_select_window(sk) >= tp->rcv_wnd))) || - /* We ACK each frame or... 
*/ - tcp_in_quickack_mode(sk) || - /* Protocol state mandates a one-time immediate ACK */ -diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c -index bb1fe1ba867a..050a80769de6 100644 ---- a/net/ipv4/tcp_minisocks.c -+++ b/net/ipv4/tcp_minisocks.c -@@ -462,6 +462,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) - u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); - bool ca_got_dst = false; - -+ tcp_set_ecn_low_from_dst(sk, dst); -+ - if (ca_key != TCP_CA_UNSPEC) { - const struct tcp_congestion_ops *ca; - -diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 4fd746bd4d54..273fed558494 100644 ---- a/net/ipv4/tcp_output.c -+++ b/net/ipv4/tcp_output.c -@@ -336,10 +336,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) - bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); - bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || - tcp_ca_needs_ecn(sk) || bpf_needs_ecn; -+ const struct dst_entry *dst = __sk_dst_get(sk); - - if (!use_ecn) { -- const struct dst_entry *dst = __sk_dst_get(sk); -- - if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) - use_ecn = true; - } -@@ -351,6 +350,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) - tp->ecn_flags = TCP_ECN_OK; - if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) - INET_ECN_xmit(sk); -+ -+ if (dst) -+ tcp_set_ecn_low_from_dst(sk, dst); - } - } - -@@ -388,7 +390,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, - th->cwr = 1; - skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; - } -- } else if (!tcp_ca_needs_ecn(sk)) { -+ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && -+ !tcp_ca_needs_ecn(sk)) { - /* ACK or retransmitted segment: clear ECT|CE */ - INET_ECN_dontxmit(sk); - } -@@ -1601,7 +1604,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, - { - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *buff; -- int old_factor; -+ int old_factor, inflight_prev; - long limit; - int nlen; - u8 flags; -@@ -1676,6 +1679,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, - - if (diff) - tcp_adjust_pcount(sk, skb, diff); -+ -+ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; -+ if (inflight_prev < 0) { -+ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( -+ old_factor, -+ TCP_SKB_CB(skb)->sacked, -+ TCP_SKB_CB(skb)->tx.in_flight), -+ "inconsistent: tx.in_flight: %u " -+ "old_factor: %d mss: %u sacked: %u " -+ "1st pcount: %d 2nd pcount: %d " -+ "1st len: %u 2nd len: %u ", -+ TCP_SKB_CB(skb)->tx.in_flight, old_factor, -+ mss_now, TCP_SKB_CB(skb)->sacked, -+ tcp_skb_pcount(skb), tcp_skb_pcount(buff), -+ skb->len, buff->len); -+ inflight_prev = 0; -+ } -+ /* Set 1st tx.in_flight as if 1st were sent by itself: */ -+ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + -+ tcp_skb_pcount(skb); -+ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ -+ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + -+ tcp_skb_pcount(skb) + -+ tcp_skb_pcount(buff); - } - - /* Link BUFF into the send queue. */ -@@ -2033,13 +2060,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, - static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) - { - const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; -- u32 min_tso, tso_segs; -- -- min_tso = ca_ops->min_tso_segs ? -- ca_ops->min_tso_segs(sk) : -- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); -+ u32 tso_segs; - -- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); -+ tso_segs = ca_ops->tso_segs ? 
-+ ca_ops->tso_segs(sk, mss_now) : -+ tcp_tso_autosize(sk, mss_now, -+ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); - return min_t(u32, tso_segs, sk->sk_gso_max_segs); - } - -@@ -2768,6 +2794,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, - skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); - list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); - tcp_init_tso_segs(skb, mss_now); -+ tcp_set_tx_in_flight(sk, skb); - goto repair; /* Skip network transmission */ - } - -@@ -2982,6 +3009,7 @@ void tcp_send_loss_probe(struct sock *sk) - if (WARN_ON(!skb || !tcp_skb_pcount(skb))) - goto rearm_timer; - -+ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; - if (__tcp_retransmit_skb(sk, skb, 1)) - goto rearm_timer; - -diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c -index a8f6d9d06f2e..8737f2134648 100644 ---- a/net/ipv4/tcp_rate.c -+++ b/net/ipv4/tcp_rate.c -@@ -34,6 +34,24 @@ - * ready to send in the write queue. - */ - -+void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) -+{ -+ struct tcp_sock *tp = tcp_sk(sk); -+ u32 in_flight; -+ -+ /* Check, sanitize, and record packets in flight after skb was sent. */ -+ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); -+ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, -+ "insane in_flight %u cc %s mss %u " -+ "cwnd %u pif %u %u %u %u\n", -+ in_flight, inet_csk(sk)->icsk_ca_ops->name, -+ tp->mss_cache, tp->snd_cwnd, -+ tp->packets_out, tp->retrans_out, -+ tp->sacked_out, tp->lost_out)) -+ in_flight = TCPCB_IN_FLIGHT_MAX; -+ TCP_SKB_CB(skb)->tx.in_flight = in_flight; -+} -+ - /* Snapshot the current delivery information in the skb, to generate - * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). - */ -@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) - TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; - TCP_SKB_CB(skb)->tx.delivered = tp->delivered; - TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; -+ TCP_SKB_CB(skb)->tx.lost = tp->lost; - TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; -+ tcp_set_tx_in_flight(sk, skb); - } - - /* When an skb is sacked or acked, we fill in the rate sample with the (prior) -@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, - if (!rs->prior_delivered || - tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, - scb->end_seq, rs->last_end_seq)) { -+ rs->prior_lost = scb->tx.lost; - rs->prior_delivered_ce = scb->tx.delivered_ce; - rs->prior_delivered = scb->tx.delivered; - rs->prior_mstamp = scb->tx.delivered_mstamp; - rs->is_app_limited = scb->tx.is_app_limited; - rs->is_retrans = scb->sacked & TCPCB_RETRANS; -+ rs->tx_in_flight = scb->tx.in_flight; - rs->last_end_seq = scb->end_seq; - - /* Record send time of most recently ACKed packet: */ - tp->first_tx_mstamp = tx_tstamp; - /* Find the duration of the "send phase" of this window: */ -- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, -- scb->tx.first_tx_mstamp); -+ rs->interval_us = tcp_stamp32_us_delta( -+ tp->first_tx_mstamp, -+ scb->tx.first_tx_mstamp); - - } - /* Mark off the skb delivered once it's sacked to avoid being -@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - return; - } - rs->delivered = tp->delivered - rs->prior_delivered; -+ rs->lost = tp->lost - rs->prior_lost; - - rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; - /* delivered_ce occupies less than 32 bits in the skb control block */ -@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - * longer phase. - */ - snd_us = rs->interval_us; /* send phase */ -- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, -+ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, - rs->prior_mstamp); /* ack phase */ - rs->interval_us = max(snd_us, ack_us); - -diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c -index 79064580c8c0..697270ce1ea6 100644 ---- a/net/ipv4/tcp_timer.c -+++ b/net/ipv4/tcp_timer.c -@@ -690,6 +690,7 @@ void tcp_write_timer_handler(struct sock *sk) - return; - } - -+ tcp_rate_check_app_limited(sk); - tcp_mstamp_refresh(tcp_sk(sk)); - event = icsk->icsk_pending; - --- -2.47.0.rc0 - diff --git a/patches/sys-kernel/gentoo-sources/0003-glitched-base.patch b/patches/sys-kernel/gentoo-sources/0003-glitched-base.patch deleted file mode 100644 index beb4aae..0000000 --- a/patches/sys-kernel/gentoo-sources/0003-glitched-base.patch +++ /dev/null @@ -1,822 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: [PATCH 01/17] glitched - ---- - init/Makefile | 2 +- - 1 file changed, 1 insertions(+), 1 deletions(-) - -diff --git a/init/Makefile b/init/Makefile -index baf3ab8d9d49..854e32e6aec7 100755 ---- a/init/Makefile -+++ b/init/Makefile -@@ -19,7 +19,7 @@ else - - # Maximum length of UTS_VERSION is 64 chars - filechk_uts_version = \ -- utsver=$$(echo '$(pound)'"$(build-version)" $(smp-flag-y) $(preempt-flag-y) "$(build-timestamp)" | cut -b -64); \ -+ utsver=$$(echo '$(pound)'"$(build-version)" $(smp-flag-y) $(preempt-flag-y) "siina" "$(build-timestamp)" | cut -b -64); \ - echo '$(pound)'define UTS_VERSION \""$${utsver}"\" - - # --- -2.28.0 - - -From c304f43d14e98d4bf1215fc10bc5012f554bdd8a Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 29 Jan 2018 16:59:22 +0000 -Subject: [PATCH 02/17] dcache: cache_pressure = 50 decreases the rate at which - VFS caches are reclaimed - -Signed-off-by: Alexandre Frade ---- - fs/dcache.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/fs/dcache.c 
b/fs/dcache.c -index 361ea7ab30ea..0c5cf69b241a 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c -@@ -71,7 +71,7 @@ - * If no ancestor relationship: - * arbitrary, since it's serialized on rename_lock - */ --int sysctl_vfs_cache_pressure __read_mostly = 100; -+int sysctl_vfs_cache_pressure __read_mostly = 50; - EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); - - __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); --- -2.28.0 - - -diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c -index f788cd61df21..2bfbb4213707 100644 ---- a/kernel/sched/rt.c -+++ b/kernel/sched/rt.c -@@ -15,9 +15,9 @@ __read_mostly int scheduler_running; - - /* - * part of the period that we allow rt tasks to run in us. -- * default: 0.95s -+ * XanMod default: 0.98s - */ --int sysctl_sched_rt_runtime = 950000; -+int sysctl_sched_rt_runtime = 980000; - - #ifdef CONFIG_SYSCTL - static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; --- -2.28.0 - - -From acc49f33a10f61dc66c423888cbb883ba46710e4 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 29 Jan 2018 17:41:29 +0000 -Subject: [PATCH 04/17] scripts: disable the localversion "+" tag of a git repo - -Signed-off-by: Alexandre Frade ---- - scripts/setlocalversion | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/scripts/setlocalversion b/scripts/setlocalversion -index 20f2efd57b11..0552d8b9f582 100755 ---- a/scripts/setlocalversion -+++ b/scripts/setlocalversion -@@ -54,7 +54,7 @@ scm_version() - # If only the short version is requested, don't bother - # running further git commands - if $short; then -- echo "+" -+ #echo "+" - return - fi - # If we are past the tagged commit, we pretty print it. --- -2.28.0 - - -From 360c6833e07cc9fdef5746f6bc45bdbc7212288d Mon Sep 17 00:00:00 2001 -From: "Jan Alexander Steffens (heftig)" -Date: Fri, 26 Oct 2018 11:22:33 +0100 -Subject: [PATCH 06/17] infiniband: Fix __read_overflow2 error with -O3 - inlining - ---- - drivers/infiniband/core/addr.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c -index 3a98439bba83..6efc4f907f58 100644 ---- a/drivers/infiniband/core/addr.c -+++ b/drivers/infiniband/core/addr.c -@@ -820,6 +820,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, - union { - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; -+ struct sockaddr_ib _sockaddr_ib; - } sgid_addr, dgid_addr; - int ret; - --- -2.28.0 - - -From f85ed068b4d0e6c31edce8574a95757a60e58b87 Mon Sep 17 00:00:00 2001 -From: Etienne Juvigny -Date: Mon, 3 Sep 2018 17:36:25 +0200 -Subject: [PATCH 07/17] Add Zenify option - ---- - init/Kconfig | 32 ++++++++++++++++++++++++++++++++ - 1 file changed, 32 insertions(+) - -diff --git a/init/Kconfig b/init/Kconfig -index 3ae8678e1145..da708eed0f1e 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -92,6 +92,38 @@ config THREAD_INFO_IN_TASK - - menu "General setup" - -+config ZENIFY -+ bool "A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience" -+ default y -+ help -+ Tunes the kernel for responsiveness at the cost of throughput and power usage. 
-+ -+ --- Virtual Memory Subsystem --------------------------- -+ -+ Mem dirty before bg writeback..: 10 % -> 20 % -+ Mem dirty before sync writeback: 20 % -> 50 % -+ -+ --- Block Layer ---------------------------------------- -+ -+ Queue depth...............: 128 -> 512 -+ Default MQ scheduler......: mq-deadline -> bfq -+ -+ --- CFS CPU Scheduler ---------------------------------- -+ -+ Scheduling latency.............: 6 -> 3 ms -+ Minimal granularity............: 0.75 -> 0.3 ms -+ Wakeup granularity.............: 1 -> 0.5 ms -+ CPU migration cost.............: 0.5 -> 0.25 ms -+ Bandwidth slice size...........: 5 -> 3 ms -+ Ondemand fine upscaling limit..: 95 % -> 85 % -+ -+ --- MuQSS CPU Scheduler -------------------------------- -+ -+ Scheduling interval............: 6 -> 3 ms -+ ISO task max realtime use......: 70 % -> 25 % -+ Ondemand coarse upscaling limit: 80 % -> 45 % -+ Ondemand fine upscaling limit..: 95 % -> 45 % -+ - config BROKEN - bool - --- -2.28.0 - - -From e92e67143385cf285851e12aa8b7f083dd38dd24 Mon Sep 17 00:00:00 2001 -From: Steven Barrett -Date: Sun, 16 Jan 2011 18:57:32 -0600 -Subject: [PATCH 08/17] ZEN: Allow TCP YeAH as default congestion control - -4.4: In my tests YeAH dramatically slowed down transfers over a WLAN, - reducing throughput from ~65Mbps (CUBIC) to ~7MBps (YeAH) over 10 - seconds (netperf TCP_STREAM) including long stalls. - - Be careful when choosing this. ~heftig ---- - net/ipv4/Kconfig | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index e64e59b536d3..bfb55ef7ebbe 100644 ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -691,6 +691,9 @@ choice - config DEFAULT_VEGAS - bool "Vegas" if TCP_CONG_VEGAS=y - -+ config DEFAULT_YEAH -+ bool "YeAH" if TCP_CONG_YEAH=y -+ - config DEFAULT_VENO - bool "Veno" if TCP_CONG_VENO=y - -@@ -724,6 +727,7 @@ config DEFAULT_TCP_CONG - default "htcp" if DEFAULT_HTCP - default "hybla" if DEFAULT_HYBLA - default "vegas" if DEFAULT_VEGAS -+ default "yeah" if DEFAULT_YEAH - default "westwood" if DEFAULT_WESTWOOD - default "veno" if DEFAULT_VENO - default "reno" if DEFAULT_RENO --- -2.28.0 - - -From 76dbe7477bfde1b5e8bf29a71b5af7ab2be9b98e Mon Sep 17 00:00:00 2001 -From: Steven Barrett -Date: Wed, 28 Nov 2018 19:01:27 -0600 -Subject: [PATCH 09/17] zen: Use [defer+madvise] as default khugepaged defrag - strategy - -For some reason, the default strategy to respond to THP fault fallbacks -is still just madvise, meaning stall if the program wants transparent -hugepages, but don't trigger a background reclaim / compaction if THP -begins to fail allocations. This creates a snowball affect where we -still use the THP code paths, but we almost always fail once a system -has been active and busy for a while. - -The option "defer" was created for interactive systems where THP can -still improve performance. If we have to fallback to a regular page due -to an allocation failure or anything else, we will trigger a background -reclaim and compaction so future THP attempts succeed and previous -attempts eventually have their smaller pages combined without stalling -running applications. - -We still want madvise to stall applications that explicitely want THP, -so defer+madvise _does_ make a ton of sense. Make it the default for -interactive systems, especially if the kernel maintainer left -transparent hugepages on "always". 
- -Reasoning and details in the original patch: https://lwn.net/Articles/711248/ ---- - mm/huge_memory.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index 74300e337c3c..9277f22c10a7 100644 ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -53,7 +53,11 @@ unsigned long transparent_hugepage_flags __read_mostly = - #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE - (1< -Date: Wed, 24 Oct 2018 16:58:52 -0300 -Subject: [PATCH 10/17] net/sched: allow configuring cake qdisc as default - -Signed-off-by: Alexandre Frade ---- - net/sched/Kconfig | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/net/sched/Kconfig b/net/sched/Kconfig -index 84badf00647e..6a922bca9f39 100644 ---- a/net/sched/Kconfig -+++ b/net/sched/Kconfig -@@ -471,6 +471,9 @@ choice - config DEFAULT_SFQ - bool "Stochastic Fair Queue" if NET_SCH_SFQ - -+ config DEFAULT_CAKE -+ bool "Common Applications Kept Enhanced" if NET_SCH_CAKE -+ - config DEFAULT_PFIFO_FAST - bool "Priority FIFO Fast" - endchoice -@@ -481,6 +484,7 @@ config DEFAULT_NET_SCH - default "fq" if DEFAULT_FQ - default "fq_codel" if DEFAULT_FQ_CODEL - default "sfq" if DEFAULT_SFQ -+ default "cake" if DEFAULT_CAKE - default "pfifo_fast" - endif - --- -2.28.0 - - -From 90240bcd90a568878738e66c0d45bed3e38e347b Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Fri, 19 Apr 2019 12:33:38 +0200 -Subject: [PATCH 12/17] Set vm.max_map_count to 262144 by default - -The value is still pretty low, and AMD64-ABI and ELF extended numbering -supports that, so we should be fine on modern x86 systems. - -This fixes crashes in some applications using more than 65535 vmas (also -affects some windows games running in wine, such as Star Citizen). ---- - include/linux/mm.h | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) - -diff --git a/include/linux/mm.h b/include/linux/mm.h -index bc05c3588aa3..b0cefe94920d 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -190,8 +190,7 @@ static inline void __mm_zero_struct_page(struct page *page) - * not a hard limit any more. Although some userspace tools can be surprised by - * that. - */ --#define MAPCOUNT_ELF_CORE_MARGIN (5) --#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) -+#define DEFAULT_MAX_MAP_COUNT (262144) - - extern int sysctl_max_map_count; - --- -2.28.0 - - -From 3a34034dba5efe91bcec491efe8c66e8087f509b Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Mon, 27 Jul 2020 00:19:18 +0200 -Subject: [PATCH 13/17] mm: bump DEFAULT_MAX_MAP_COUNT - -Some games such as Detroit: Become Human tend to be very crash prone with -lower values. ---- - include/linux/mm.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/linux/mm.h b/include/linux/mm.h -index b0cefe94920d..890165099b07 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -190,7 +190,7 @@ static inline void __mm_zero_struct_page(struct page *page) - * not a hard limit any more. Although some userspace tools can be surprised by - * that. 
- */ --#define DEFAULT_MAX_MAP_COUNT (262144) -+#define DEFAULT_MAX_MAP_COUNT (16777216) - - extern int sysctl_max_map_count; - --- -2.28.0 - -From 977812938da7c7226415778c340832141d9278b7 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 25 Nov 2019 15:13:06 -0300 -Subject: [PATCH 14/17] elevator: set default scheduler to bfq for blk-mq - -Signed-off-by: Alexandre Frade ---- - block/elevator.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/block/elevator.c b/block/elevator.c -index 4eab3d70e880..79669aa39d79 100644 ---- a/block/elevator.c -+++ b/block/elevator.c -@@ -623,19 +623,19 @@ static inline bool elv_support_iosched(struct request_queue *q) - } - - /* -- * For single queue devices, default to using mq-deadline. If we have multiple -- * queues or mq-deadline is not available, default to "none". -+ * For single queue devices, default to using bfq. If we have multiple -+ * queues or bfq is not available, default to "none". - */ - static struct elevator_type *elevator_get_default(struct request_queue *q) - { - if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) - return NULL; - - if (q->nr_hw_queues != 1 && - !blk_mq_is_shared_tags(q->tag_set->flags)) - return NULL; - -- return elevator_find_get(q, "mq-deadline"); -+ return elevator_find_get(q, "bfq"); - } - - /* --- -2.28.0 - -From 3c229f434aca65c4ca61772bc03c3e0370817b92 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 3 Aug 2020 17:05:04 +0000 -Subject: [PATCH 16/17] mm: set 2 megabytes for address_space-level file - read-ahead pages size - -Signed-off-by: Alexandre Frade ---- - include/linux/pagemap.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index cf2468da68e9..007dea784451 100644 ---- a/include/linux/pagemap.h -+++ b/include/linux/pagemap.h -@@ -655,7 +655,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); - void delete_from_page_cache_batch(struct address_space *mapping, - struct pagevec *pvec); - --#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) -+#define VM_READAHEAD_PAGES (SZ_2M / PAGE_SIZE) - - void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, - struct file *, pgoff_t index, unsigned long req_count); --- -2.28.0 - - -From 716f41cf6631f3a85834dcb67b4ce99185b6387f Mon Sep 17 00:00:00 2001 -From: Steven Barrett -Date: Wed, 15 Jan 2020 20:43:56 -0600 -Subject: [PATCH 17/17] ZEN: intel-pstate: Implement "enable" parameter - -If intel-pstate is compiled into the kernel, it will preempt the loading -of acpi-cpufreq so you can take advantage of hardware p-states without -any friction. - -However, intel-pstate is not completely superior to cpufreq's ondemand -for one reason. There's no concept of an up_threshold property. - -In ondemand, up_threshold essentially reduces the maximum utilization to -compare against, allowing you to hit max frequencies and turbo boost -from a much lower core utilization. - -With intel-pstate, you have the concept of minimum and maximum -performance, but no tunable that lets you define, maximum frequency -means 50% core utilization. For just this oversight, there's reasons -you may want ondemand. - -Lets support setting "enable" in kernel boot parameters. This lets -kernel maintainers include "intel_pstate=disable" statically in the -static boot parameters, but let users of the kernel override this -selection. 
---- - Documentation/admin-guide/kernel-parameters.txt | 3 +++ - drivers/cpufreq/intel_pstate.c | 2 ++ - 2 files changed, 5 insertions(+) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index fb95fad81c79..3e92fee81e33 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -1857,6 +1857,9 @@ - disable - Do not enable intel_pstate as the default - scaling driver for the supported processors -+ enable -+ Enable intel_pstate in-case "disable" was passed -+ previously in the kernel boot parameters - passive - Use intel_pstate as a scaling driver, but configure it - to work with generic cpufreq governors (instead of -diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c -index 36a469150ff9..aee891c9b78a 100644 ---- a/drivers/cpufreq/intel_pstate.c -+++ b/drivers/cpufreq/intel_pstate.c -@@ -2845,6 +2845,8 @@ static int __init intel_pstate_setup(char *str) - if (!strcmp(str, "no_hwp")) - no_hwp = 1; - -+ if (!strcmp(str, "enable")) -+ no_load = 0; - if (!strcmp(str, "force")) - force_load = 1; - if (!strcmp(str, "hwp_only")) --- -2.28.0 - -From 379cbab18b5c75c622b93e2c5abdfac141fe9654 Mon Sep 17 00:00:00 2001 -From: Kenny Levinsen -Date: Sun, 27 Dec 2020 14:43:13 +0000 -Subject: [PATCH] ZEN: Input: evdev - use call_rcu when detaching client - -Significant time was spent on synchronize_rcu in evdev_detach_client -when applications closed evdev devices. Switching VT away from a -graphical environment commonly leads to mass input device closures, -which could lead to noticable delays on systems with many input devices. - -Replace synchronize_rcu with call_rcu, deferring reclaim of the evdev -client struct till after the RCU grace period instead of blocking the -calling application. 
- -While this does not solve all slow evdev fd closures, it takes care of a -good portion of them, including this simple test: - - #include - #include - - int main(int argc, char *argv[]) - { - int idx, fd; - const char *path = "/dev/input/event0"; - for (idx = 0; idx < 1000; idx++) { - if ((fd = open(path, O_RDWR)) == -1) { - return -1; - } - close(fd); - } - return 0; - } - -Time to completion of above test when run locally: - - Before: 0m27.111s - After: 0m0.018s - -Signed-off-by: Kenny Levinsen ---- - drivers/input/evdev.c | 19 +++++++++++-------- - 1 file changed, 11 insertions(+), 8 deletions(-) - -diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c -index 95f90699d2b17b..2b10fe29d2c8d9 100644 ---- a/drivers/input/evdev.c -+++ b/drivers/input/evdev.c -@@ -46,6 +46,7 @@ struct evdev_client { - struct fasync_struct *fasync; - struct evdev *evdev; - struct list_head node; -+ struct rcu_head rcu; - enum input_clock_type clk_type; - bool revoked; - unsigned long *evmasks[EV_CNT]; -@@ -377,13 +378,22 @@ static void evdev_attach_client(struct evdev *evdev, - spin_unlock(&evdev->client_lock); - } - -+static void evdev_reclaim_client(struct rcu_head *rp) -+{ -+ struct evdev_client *client = container_of(rp, struct evdev_client, rcu); -+ unsigned int i; -+ for (i = 0; i < EV_CNT; ++i) -+ bitmap_free(client->evmasks[i]); -+ kvfree(client); -+} -+ - static void evdev_detach_client(struct evdev *evdev, - struct evdev_client *client) - { - spin_lock(&evdev->client_lock); - list_del_rcu(&client->node); - spin_unlock(&evdev->client_lock); -- synchronize_rcu(); -+ call_rcu(&client->rcu, evdev_reclaim_client); - } - - static int evdev_open_device(struct evdev *evdev) -@@ -436,7 +446,6 @@ static int evdev_release(struct inode *inode, struct file *file) - { - struct evdev_client *client = file->private_data; - struct evdev *evdev = client->evdev; -- unsigned int i; - - mutex_lock(&evdev->mutex); - -@@ -448,11 +457,6 @@ static int evdev_release(struct inode *inode, struct file *file) - - evdev_detach_client(evdev, client); - -- for (i = 0; i < EV_CNT; ++i) -- bitmap_free(client->evmasks[i]); -- -- kvfree(client); -- - evdev_close_device(evdev); - - return 0; -@@ -495,7 +499,6 @@ static int evdev_open(struct inode *inode, struct file *file) - - err_free_client: - evdev_detach_client(evdev, client); -- kvfree(client); - return error; - } - - -From 2aafb56f20e4b63d8c4af172fe9d017c64bc4129 Mon Sep 17 00:00:00 2001 -From: Sultan Alsawaf -Date: Wed, 20 Oct 2021 20:50:11 -0700 -Subject: [PATCH] ZEN: mm: Lower the non-hugetlbpage pageblock size to reduce - scheduling delays - -The page allocator processes free pages in groups of pageblocks, where -the size of a pageblock is typically quite large (1024 pages without -hugetlbpage support). Pageblocks are processed atomically with the zone -lock held, which can cause severe scheduling delays on both the CPU -going through the pageblock and any other CPUs waiting to acquire the -zone lock. A frequent offender is move_freepages_block(), which is used -by rmqueue() for page allocation. - -As it turns out, there's no requirement for pageblocks to be so large, -so the pageblock order can simply be reduced to ease the scheduling -delays and zone lock contention. PAGE_ALLOC_COSTLY_ORDER is used as a -reasonable setting to ensure non-costly page allocation requests can -still be serviced without always needing to free up more than one -pageblock's worth of pages at a time. - -This has a noticeable effect on overall system latency when memory -pressure is elevated. 
The various mm functions which operate on -pageblocks no longer appear in the preemptoff tracer, where previously -they would spend up to 100 ms on a mobile arm64 CPU processing a -pageblock with preemption disabled and the zone lock held. - -Signed-off-by: Sultan Alsawaf ---- - include/linux/pageblock-flags.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h -index 5f1ae07d724b88..97cda629c9e909 100644 ---- a/include/linux/pageblock-flags.h -+++ b/include/linux/pageblock-flags.h -@@ -48,7 +48,7 @@ extern unsigned int pageblock_order; - #else /* CONFIG_HUGETLB_PAGE */ - - /* If huge pages are not used, group by MAX_ORDER_NR_PAGES */ --#define pageblock_order MAX_PAGE_ORDER -+#define pageblock_order PAGE_ALLOC_COSTLY_ORDER - - #endif /* CONFIG_HUGETLB_PAGE */ - - -From f22bc56be85e69c71c8e36041193856bb8b01525 Mon Sep 17 00:00:00 2001 -From: Sultan Alsawaf -Date: Wed, 20 Oct 2021 20:50:32 -0700 -Subject: [PATCH] ZEN: mm: Don't hog the CPU and zone lock in rmqueue_bulk() - -There is noticeable scheduling latency and heavy zone lock contention -stemming from rmqueue_bulk's single hold of the zone lock while doing -its work, as seen with the preemptoff tracer. There's no actual need for -rmqueue_bulk() to hold the zone lock the entire time; it only does so -for supposed efficiency. As such, we can relax the zone lock and even -reschedule when IRQs are enabled in order to keep the scheduling delays -and zone lock contention at bay. Forward progress is still guaranteed, -as the zone lock can only be relaxed after page removal. - -With this change, rmqueue_bulk() no longer appears as a serious offender -in the preemptoff tracer, and system latency is noticeably improved. - -Signed-off-by: Sultan Alsawaf ---- - mm/page_alloc.c | 23 ++++++++++++++++++----- - 1 file changed, 18 insertions(+), 5 deletions(-) - -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index a0b0397e29ee4c..87a983a356530c 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -3118,15 +3119,16 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, - } - - /* -- * Obtain a specified number of elements from the buddy allocator, all under -- * a single hold of the lock, for efficiency. Add them to the supplied list. -- * Returns the number of new pages which were placed at *list. -+ * Obtain a specified number of elements from the buddy allocator, and relax the -+ * zone lock when needed. Add them to the supplied list. Returns the number of -+ * new pages which were placed at *list. - */ - static int rmqueue_bulk(struct zone *zone, unsigned int order, - unsigned long count, struct list_head *list, - int migratetype, unsigned int alloc_flags) - { - unsigned long flags; -- int i; -+ const bool can_resched = !preempt_count() && !irqs_disabled(); -+ int i, allocated = 0, last_mod = 0; - - /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. 
*/ - spin_lock(&zone->lock); -@@ -3137,6 +3138,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, - if (unlikely(page == NULL)) - break; - -+ /* Reschedule and ease the contention on the lock if needed */ -+ if (i + 1 < count && ((can_resched && need_resched()) || -+ spin_needbreak(&zone->lock))) { -+ __mod_zone_page_state(zone, NR_FREE_PAGES, -+ -((i + 1 - last_mod) << order)); -+ last_mod = i + 1; -+ spin_unlock(&zone->lock); -+ if (can_resched) -+ cond_resched(); -+ spin_lock(&zone->lock); -+ } -+ - if (unlikely(check_pcp_refill(page, order))) - continue; - -@@ -3163,7 +3176,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, - * on i. Do not confuse with 'allocated' which is the number of - * pages added to the pcp list. - */ -- __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); -+ __mod_zone_page_state(zone, NR_FREE_PAGES, -((i - last_mod) << order)); - spin_unlock(&zone->lock); - return allocated; - } - -From 6329525a0fa10cd13f39b76948b1296150f75c95 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 29 Aug 2022 16:47:26 +0000 -Subject: [PATCH 14/16] XANMOD: Makefile: Disable GCC vectorization on trees - -Signed-off-by: Alexandre Frade ---- - Makefile | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/Makefile b/Makefile -index 3f6628780eb2..35a5ae1ede42 100644 ---- a/Makefile -+++ b/Makefile -@@ -1069,6 +1069,9 @@ endif - KBUILD_CFLAGS-$(call gcc-min-version, 90100) += -Wno-alloc-size-larger-than - KBUILD_CFLAGS += $(KBUILD_CFLAGS-y) $(CONFIG_CC_IMPLICIT_FALLTHROUGH) - -+# disable GCC vectorization on trees -+KBUILD_CFLAGS += $(call cc-option, -fno-tree-vectorize) -+ - # disable invalid "can't wrap" optimizations for signed / pointers - KBUILD_CFLAGS += -fno-strict-overflow - --- -2.39.1 - -From f997578464b2c4c63e7bd1afbfef56212ee44f2d Mon Sep 17 00:00:00 2001 -From: Etienne JUVIGNY -Date: Mon, 6 Mar 2023 13:54:09 +0100 -Subject: Don't add -dirty versioning on unclean trees - - -diff --git a/scripts/setlocalversion b/scripts/setlocalversion -index ca5795e16..ad0d94477 100755 ---- a/scripts/setlocalversion -+++ b/scripts/setlocalversion -@@ -85,12 +85,12 @@ scm_version() - # git-diff-index does not refresh the index, so it may give misleading - # results. - # See git-update-index(1), git-diff-index(1), and git-status(1). -- if { -- git --no-optional-locks status -uno --porcelain 2>/dev/null || -- git diff-index --name-only HEAD -- } | read dummy; then -- printf '%s' -dirty -- fi -+ #if { -+ # git --no-optional-locks status -uno --porcelain 2>/dev/null || -+ # git diff-index --name-only HEAD -+ #} | read dummy; then -+ # printf '%s' -dirty -+ #fi - } - - collect_files() - -From 1cf70fdd26245554ab30234722338d8160dff394 Mon Sep 17 00:00:00 2001 -From: Steven Barrett -Date: Sat, 21 May 2022 15:15:09 -0500 -Subject: [PATCH] ZEN: INTERACTIVE: dm-crypt: Disable workqueues for crypto ops - -Queueing in dm-crypt for crypto operations reduces performance on modern -systems. As discussed in an article from Cloudflare, they discovered -that queuing was introduced because the crypto subsystem used to be -synchronous. Since it's now asynchronous, we get double queueing when -using the subsystem through dm-crypt. This is obviously undesirable and -reduces throughput and increases latency. - -Disable queueing when using our Zen Interactive configuration. 
- -Fixes: https://github.com/zen-kernel/zen-kernel/issues/282 - -tkg: Config switch changed to our local "ZENIFY" toggle ---- - drivers/md/dm-crypt.c | 5 +++++ - init/Kconfig | 1 + - 2 files changed, 6 insertions(+) - -diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c -index 2ae8560b6a14ad..cb49218030c88b 100644 ---- a/drivers/md/dm-crypt.c -+++ b/drivers/md/dm-crypt.c -@@ -3242,6 +3242,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) - goto bad; - } - -+#ifdef CONFIG_ZENIFY -+ set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); -+ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); -+#endif -+ - ret = crypt_ctr_cipher(ti, argv[0], argv[1]); - if (ret < 0) - goto bad; diff --git a/patches/sys-kernel/gentoo-sources/0003-glitched-cfs.patch b/patches/sys-kernel/gentoo-sources/0003-glitched-cfs.patch deleted file mode 100644 index f0e4822..0000000 --- a/patches/sys-kernel/gentoo-sources/0003-glitched-cfs.patch +++ /dev/null @@ -1,117 +0,0 @@ -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. 
-+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 6b423eebfd5d..61e3271675d6 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -21,10 +21,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (55) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define MICRO_FREQUENCY_UP_THRESHOLD (63) - #define MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD (100) - -From cba31b19f8c38696b13ba48e0e8b6dbe747d6bae Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 29 Jan 2018 17:31:25 +0000 -Subject: [PATCH 10/16] XANMOD: mm/vmscan: vm_swappiness = 30 decreases the - amount of swapping - -Signed-off-by: Alexandre Frade -Signed-off-by: Alexandre Frade ---- - mm/vmscan.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 5b7b8d4f5297..549684b29418 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -190,7 +190,7 @@ struct scan_control { - /* - * From 0 .. 200. Higher means more swappy. - */ --int vm_swappiness = 60; -+int vm_swappiness = 30; - - static void set_task_reclaim_state(struct task_struct *task, - struct reclaim_state *rs) --- -2.39.1 diff --git a/patches/sys-kernel/gentoo-sources/0005-ksm.patch b/patches/sys-kernel/gentoo-sources/0005-ksm.patch deleted file mode 100644 index edc38c9..0000000 --- a/patches/sys-kernel/gentoo-sources/0005-ksm.patch +++ /dev/null @@ -1,433 +0,0 @@ -From 3d95a728e5570d4aedad19d699fd23db695f1ada Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Mon, 7 Oct 2024 11:53:40 +0200 -Subject: [PATCH 5/8] ksm - -Signed-off-by: Peter Jung ---- - arch/alpha/kernel/syscalls/syscall.tbl | 3 + - arch/arm/tools/syscall.tbl | 3 + - arch/m68k/kernel/syscalls/syscall.tbl | 3 + - arch/microblaze/kernel/syscalls/syscall.tbl | 3 + - arch/mips/kernel/syscalls/syscall_n32.tbl | 3 + - arch/mips/kernel/syscalls/syscall_n64.tbl | 3 + - arch/mips/kernel/syscalls/syscall_o32.tbl | 3 + - arch/parisc/kernel/syscalls/syscall.tbl | 3 + - arch/powerpc/kernel/syscalls/syscall.tbl | 3 + - arch/s390/kernel/syscalls/syscall.tbl | 3 + - arch/sh/kernel/syscalls/syscall.tbl | 3 + - arch/sparc/kernel/syscalls/syscall.tbl | 3 + - arch/x86/entry/syscalls/syscall_32.tbl | 3 + - arch/x86/entry/syscalls/syscall_64.tbl | 3 + - arch/xtensa/kernel/syscalls/syscall.tbl | 3 + - include/linux/syscalls.h | 3 + - include/uapi/asm-generic/unistd.h | 9 +- - kernel/sys.c | 138 ++++++++++++++++++ - kernel/sys_ni.c | 3 + - scripts/syscall.tbl | 3 + - .../arch/powerpc/entry/syscalls/syscall.tbl | 3 + - .../perf/arch/s390/entry/syscalls/syscall.tbl | 3 + - 22 files changed, 206 insertions(+), 1 deletion(-) - -diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl -index 74720667fe09..e6a11f3c0a2e 100644 ---- a/arch/alpha/kernel/syscalls/syscall.tbl -+++ b/arch/alpha/kernel/syscalls/syscall.tbl -@@ -502,3 +502,6 @@ - 570 common lsm_set_self_attr sys_lsm_set_self_attr - 571 common lsm_list_modules sys_lsm_list_modules - 572 common mseal sys_mseal -+573 common process_ksm_enable 
sys_process_ksm_enable -+574 common process_ksm_disable sys_process_ksm_disable -+575 common process_ksm_status sys_process_ksm_status -diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl -index 23c98203c40f..10a3099decbe 100644 ---- a/arch/arm/tools/syscall.tbl -+++ b/arch/arm/tools/syscall.tbl -@@ -477,3 +477,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status -diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl -index 22a3cbd4c602..12d2c7594bf0 100644 ---- a/arch/m68k/kernel/syscalls/syscall.tbl -+++ b/arch/m68k/kernel/syscalls/syscall.tbl -@@ -462,3 +462,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status -diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl -index 2b81a6bd78b2..e2a93c856eed 100644 ---- a/arch/microblaze/kernel/syscalls/syscall.tbl -+++ b/arch/microblaze/kernel/syscalls/syscall.tbl -@@ -468,3 +468,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status -diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl -index 953f5b7dc723..b921fbf56fa6 100644 ---- a/arch/mips/kernel/syscalls/syscall_n32.tbl -+++ b/arch/mips/kernel/syscalls/syscall_n32.tbl -@@ -401,3 +401,6 @@ - 460 n32 lsm_set_self_attr sys_lsm_set_self_attr - 461 n32 lsm_list_modules sys_lsm_list_modules - 462 n32 mseal sys_mseal -+463 n32 process_ksm_enable sys_process_ksm_enable -+464 n32 process_ksm_disable sys_process_ksm_disable -+465 n32 process_ksm_status sys_process_ksm_status -diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl -index 1464c6be6eb3..8d7f9ddd66f4 100644 ---- a/arch/mips/kernel/syscalls/syscall_n64.tbl -+++ b/arch/mips/kernel/syscalls/syscall_n64.tbl -@@ -377,3 +377,6 @@ - 460 n64 lsm_set_self_attr sys_lsm_set_self_attr - 461 n64 lsm_list_modules sys_lsm_list_modules - 462 n64 mseal sys_mseal -+463 n64 process_ksm_enable sys_process_ksm_enable -+464 n64 process_ksm_disable sys_process_ksm_disable -+465 n64 process_ksm_status sys_process_ksm_status -diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl -index 2439a2491cff..9d6142739954 100644 ---- a/arch/mips/kernel/syscalls/syscall_o32.tbl -+++ b/arch/mips/kernel/syscalls/syscall_o32.tbl -@@ -450,3 +450,6 @@ - 460 o32 lsm_set_self_attr sys_lsm_set_self_attr - 461 o32 lsm_list_modules sys_lsm_list_modules - 462 o32 mseal sys_mseal -+463 o32 process_ksm_enable sys_process_ksm_enable -+464 o32 process_ksm_disable sys_process_ksm_disable -+465 o32 process_ksm_status sys_process_ksm_status -diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl -index 66dc406b12e4..9d46476fd908 100644 ---- 
a/arch/parisc/kernel/syscalls/syscall.tbl -+++ b/arch/parisc/kernel/syscalls/syscall.tbl -@@ -461,3 +461,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status -diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl -index ebae8415dfbb..16f71bc2f6f0 100644 ---- a/arch/powerpc/kernel/syscalls/syscall.tbl -+++ b/arch/powerpc/kernel/syscalls/syscall.tbl -@@ -553,3 +553,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status -diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl -index 01071182763e..7394bad8178e 100644 ---- a/arch/s390/kernel/syscalls/syscall.tbl -+++ b/arch/s390/kernel/syscalls/syscall.tbl -@@ -465,3 +465,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status -diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl -index c55fd7696d40..b9fc31221b87 100644 ---- a/arch/sh/kernel/syscalls/syscall.tbl -+++ b/arch/sh/kernel/syscalls/syscall.tbl -@@ -466,3 +466,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status -diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl -index cfdfb3707c16..0d79fd772854 100644 ---- a/arch/sparc/kernel/syscalls/syscall.tbl -+++ b/arch/sparc/kernel/syscalls/syscall.tbl -@@ -508,3 +508,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status -diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl -index 534c74b14fab..c546a30575f1 100644 ---- a/arch/x86/entry/syscalls/syscall_32.tbl -+++ b/arch/x86/entry/syscalls/syscall_32.tbl -@@ -468,3 +468,6 @@ - 460 i386 lsm_set_self_attr sys_lsm_set_self_attr - 461 i386 lsm_list_modules sys_lsm_list_modules - 462 i386 mseal sys_mseal -+463 i386 process_ksm_enable sys_process_ksm_enable -+464 i386 process_ksm_disable sys_process_ksm_disable -+465 i386 process_ksm_status sys_process_ksm_status -diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl -index 7093ee21c0d1..0fcd10ba8dfe 100644 ---- a/arch/x86/entry/syscalls/syscall_64.tbl -+++ b/arch/x86/entry/syscalls/syscall_64.tbl -@@ -386,6 +386,9 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr 
- 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status - - # - # Due to a historical design error, certain syscalls are numbered differently -diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl -index 67083fc1b2f5..c1aecee4ad9b 100644 ---- a/arch/xtensa/kernel/syscalls/syscall.tbl -+++ b/arch/xtensa/kernel/syscalls/syscall.tbl -@@ -433,3 +433,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status -diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h -index 5758104921e6..cc9c4fac2412 100644 ---- a/include/linux/syscalls.h -+++ b/include/linux/syscalls.h -@@ -818,6 +818,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); - asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, - size_t vlen, int behavior, unsigned int flags); - asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); -+asmlinkage long sys_process_ksm_enable(int pidfd, unsigned int flags); -+asmlinkage long sys_process_ksm_disable(int pidfd, unsigned int flags); -+asmlinkage long sys_process_ksm_status(int pidfd, unsigned int flags); - asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, - unsigned long prot, unsigned long pgoff, - unsigned long flags); -diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h -index 5bf6148cac2b..613e559ad6e0 100644 ---- a/include/uapi/asm-generic/unistd.h -+++ b/include/uapi/asm-generic/unistd.h -@@ -841,8 +841,15 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) - #define __NR_mseal 462 - __SYSCALL(__NR_mseal, sys_mseal) - -+#define __NR_process_ksm_enable 463 -+__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable) -+#define __NR_process_ksm_disable 464 -+__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable) -+#define __NR_process_ksm_status 465 -+__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status) -+ - #undef __NR_syscalls --#define __NR_syscalls 463 -+#define __NR_syscalls 466 - - /* - * 32 bit systems traditionally used different -diff --git a/kernel/sys.c b/kernel/sys.c -index 4da31f28fda8..fcd3aeaddd05 100644 ---- a/kernel/sys.c -+++ b/kernel/sys.c -@@ -2791,6 +2791,144 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, - return error; - } - -+#ifdef CONFIG_KSM -+enum pkc_action { -+ PKSM_ENABLE = 0, -+ PKSM_DISABLE, -+ PKSM_STATUS, -+}; -+ -+static long do_process_ksm_control(int pidfd, enum pkc_action action) -+{ -+ long ret; -+ struct task_struct *task; -+ struct mm_struct *mm; -+ unsigned int f_flags; -+ -+ task = pidfd_get_task(pidfd, &f_flags); -+ if (IS_ERR(task)) { -+ ret = PTR_ERR(task); -+ goto out; -+ } -+ -+ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ -+ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); -+ if (IS_ERR_OR_NULL(mm)) { -+ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; -+ goto release_task; -+ } -+ -+ /* Require CAP_SYS_NICE for influencing process performance. 
*/ -+ if (!capable(CAP_SYS_NICE)) { -+ ret = -EPERM; -+ goto release_mm; -+ } -+ -+ if (mmap_write_lock_killable(mm)) { -+ ret = -EINTR; -+ goto release_mm; -+ } -+ -+ switch (action) { -+ case PKSM_ENABLE: -+ ret = ksm_enable_merge_any(mm); -+ break; -+ case PKSM_DISABLE: -+ ret = ksm_disable_merge_any(mm); -+ break; -+ case PKSM_STATUS: -+ ret = !!test_bit(MMF_VM_MERGE_ANY, &mm->flags); -+ break; -+ } -+ -+ mmap_write_unlock(mm); -+ -+release_mm: -+ mmput(mm); -+release_task: -+ put_task_struct(task); -+out: -+ return ret; -+} -+#endif /* CONFIG_KSM */ -+ -+SYSCALL_DEFINE2(process_ksm_enable, int, pidfd, unsigned int, flags) -+{ -+#ifdef CONFIG_KSM -+ if (flags != 0) -+ return -EINVAL; -+ -+ return do_process_ksm_control(pidfd, PKSM_ENABLE); -+#else /* CONFIG_KSM */ -+ return -ENOSYS; -+#endif /* CONFIG_KSM */ -+} -+ -+SYSCALL_DEFINE2(process_ksm_disable, int, pidfd, unsigned int, flags) -+{ -+#ifdef CONFIG_KSM -+ if (flags != 0) -+ return -EINVAL; -+ -+ return do_process_ksm_control(pidfd, PKSM_DISABLE); -+#else /* CONFIG_KSM */ -+ return -ENOSYS; -+#endif /* CONFIG_KSM */ -+} -+ -+SYSCALL_DEFINE2(process_ksm_status, int, pidfd, unsigned int, flags) -+{ -+#ifdef CONFIG_KSM -+ if (flags != 0) -+ return -EINVAL; -+ -+ return do_process_ksm_control(pidfd, PKSM_STATUS); -+#else /* CONFIG_KSM */ -+ return -ENOSYS; -+#endif /* CONFIG_KSM */ -+} -+ -+#ifdef CONFIG_KSM -+static ssize_t process_ksm_enable_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", __NR_process_ksm_enable); -+} -+static struct kobj_attribute process_ksm_enable_attr = __ATTR_RO(process_ksm_enable); -+ -+static ssize_t process_ksm_disable_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", __NR_process_ksm_disable); -+} -+static struct kobj_attribute process_ksm_disable_attr = __ATTR_RO(process_ksm_disable); -+ -+static ssize_t process_ksm_status_show(struct kobject *kobj, -+ struct kobj_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%u\n", __NR_process_ksm_status); -+} -+static struct kobj_attribute process_ksm_status_attr = __ATTR_RO(process_ksm_status); -+ -+static struct attribute *process_ksm_sysfs_attrs[] = { -+ &process_ksm_enable_attr.attr, -+ &process_ksm_disable_attr.attr, -+ &process_ksm_status_attr.attr, -+ NULL, -+}; -+ -+static const struct attribute_group process_ksm_sysfs_attr_group = { -+ .attrs = process_ksm_sysfs_attrs, -+ .name = "process_ksm", -+}; -+ -+static int __init process_ksm_sysfs_init(void) -+{ -+ return sysfs_create_group(kernel_kobj, &process_ksm_sysfs_attr_group); -+} -+subsys_initcall(process_ksm_sysfs_init); -+#endif /* CONFIG_KSM */ -+ - SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, - struct getcpu_cache __user *, unused) - { -diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c -index c00a86931f8c..d82213d68522 100644 ---- a/kernel/sys_ni.c -+++ b/kernel/sys_ni.c -@@ -186,6 +186,9 @@ COND_SYSCALL(mincore); - COND_SYSCALL(madvise); - COND_SYSCALL(process_madvise); - COND_SYSCALL(process_mrelease); -+COND_SYSCALL(process_ksm_enable); -+COND_SYSCALL(process_ksm_disable); -+COND_SYSCALL(process_ksm_status); - COND_SYSCALL(remap_file_pages); - COND_SYSCALL(mbind); - COND_SYSCALL(get_mempolicy); -diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl -index 845e24eb372e..227d9cc12365 100644 ---- a/scripts/syscall.tbl -+++ b/scripts/syscall.tbl -@@ -403,3 +403,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules 
sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status -diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl -index ebae8415dfbb..16f71bc2f6f0 100644 ---- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl -+++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl -@@ -553,3 +553,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status -diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl -index 01071182763e..7394bad8178e 100644 ---- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl -+++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl -@@ -465,3 +465,6 @@ - 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr - 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules - 462 common mseal sys_mseal sys_mseal -+463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable -+464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable -+465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status --- -2.47.0.rc0 - diff --git a/patches/sys-kernel/gentoo-sources/0006-add-acs-overrides_iommu.patch b/patches/sys-kernel/gentoo-sources/0006-add-acs-overrides_iommu.patch deleted file mode 100644 index 562c51f..0000000 --- a/patches/sys-kernel/gentoo-sources/0006-add-acs-overrides_iommu.patch +++ /dev/null @@ -1,193 +0,0 @@ -From cdeab384f48dd9c88e2dff2e9ad8d57dca1a1b1c Mon Sep 17 00:00:00 2001 -From: Mark Weiman -Date: Sun, 12 Aug 2018 11:36:21 -0400 -Subject: [PATCH] pci: Enable overrides for missing ACS capabilities - -This an updated version of Alex Williamson's patch from: -https://lkml.org/lkml/2013/5/30/513 - -Original commit message follows: - -PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that -allows us to control whether transactions are allowed to be redirected -in various subnodes of a PCIe topology. For instance, if two -endpoints are below a root port or downsteam switch port, the -downstream port may optionally redirect transactions between the -devices, bypassing upstream devices. The same can happen internally -on multifunction devices. The transaction may never be visible to the -upstream devices. - -One upstream device that we particularly care about is the IOMMU. If -a redirection occurs in the topology below the IOMMU, then the IOMMU -cannot provide isolation between devices. This is why the PCIe spec -encourages topologies to include ACS support. Without it, we have to -assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. - -Unfortunately, far too many topologies do not support ACS to make this -a steadfast requirement. Even the latest chipsets from Intel are only -sporadically supporting ACS. We have trouble getting interconnect -vendors to include the PCIe spec required PCIe capability, let alone -suggested features. - -Therefore, we need to add some flexibility. The pcie_acs_override= -boot option lets users opt-in specific devices or sets of devices to -assume ACS support. 
The "downstream" option assumes full ACS support -on root ports and downstream switch ports. The "multifunction" -option assumes the subset of ACS features available on multifunction -endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" -option enables ACS support on devices matching the provided vendor -and device IDs, allowing more strategic ACS overrides. These options -may be combined in any order. A maximum of 16 id specific overrides -are available. It's suggested to use the most limited set of options -necessary to avoid completely disabling ACS across the topology. -Note to hardware vendors, we have facilities to permanently quirk -specific devices which enforce isolation but not provide an ACS -capability. Please contact me to have your devices added and save -your customers the hassle of this boot option. - -Signed-off-by: Mark Weiman ---- - .../admin-guide/kernel-parameters.txt | 9 ++ - drivers/pci/quirks.c | 101 ++++++++++++++++++ - 2 files changed, 110 insertions(+) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index aefd358a5ca3..173b3596fd9e 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -3190,6 +3190,15 @@ - nomsi [MSI] If the PCI_MSI kernel config parameter is - enabled, this kernel boot option can be used to - disable the use of MSI interrupts system-wide. -+ pcie_acs_override = -+ [PCIE] Override missing PCIe ACS support for: -+ downstream -+ All downstream ports - full ACS capabilities -+ multifunction -+ All multifunction devices - multifunction ACS subset -+ id:nnnn:nnnn -+ Specific device - full ACS capabilities -+ Specified as vid:did (vendor/device ID) in hex - noioapicquirk [APIC] Disable all boot interrupt quirks. - Safety option to keep boot IRQs enabled. This - should never be necessary. 
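For illustration, combining the options documented above on one kernel command line might look like the following (the vendor:device pair is a placeholder, not a recommendation; real IDs are the hex values reported by lspci -nn for the device whose isolation group needs splitting):

    pcie_acs_override=downstream,multifunction,id:8086:1234

This hypothetical line assumes full ACS capability on root ports and downstream switch ports, the multifunction ACS subset on multifunction devices, and full capability on the single device matching 8086:1234; up to 16 id: entries may be combined this way, and per the patch author the most limited set of options that solves the grouping problem is preferred.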
-diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 4700d24e5d55..8f7a3d7fd9c1 100644 ---- a/drivers/pci/quirks.c -+++ b/drivers/pci/quirks.c -@@ -3372,6 +3372,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) - dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; - } - -+static bool acs_on_downstream; -+static bool acs_on_multifunction; -+ -+#define NUM_ACS_IDS 16 -+struct acs_on_id { -+ unsigned short vendor; -+ unsigned short device; -+}; -+static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; -+static u8 max_acs_id; -+ -+static __init int pcie_acs_override_setup(char *p) -+{ -+ if (!p) -+ return -EINVAL; -+ -+ while (*p) { -+ if (!strncmp(p, "downstream", 10)) -+ acs_on_downstream = true; -+ if (!strncmp(p, "multifunction", 13)) -+ acs_on_multifunction = true; -+ if (!strncmp(p, "id:", 3)) { -+ char opt[5]; -+ int ret; -+ long val; -+ -+ if (max_acs_id >= NUM_ACS_IDS - 1) { -+ pr_warn("Out of PCIe ACS override slots (%d)\n", -+ NUM_ACS_IDS); -+ goto next; -+ } -+ -+ p += 3; -+ snprintf(opt, 5, "%s", p); -+ ret = kstrtol(opt, 16, &val); -+ if (ret) { -+ pr_warn("PCIe ACS ID parse error %d\n", ret); -+ goto next; -+ } -+ acs_on_ids[max_acs_id].vendor = val; -+ -+ p += strcspn(p, ":"); -+ if (*p != ':') { -+ pr_warn("PCIe ACS invalid ID\n"); -+ goto next; -+ } -+ -+ p++; -+ snprintf(opt, 5, "%s", p); -+ ret = kstrtol(opt, 16, &val); -+ if (ret) { -+ pr_warn("PCIe ACS ID parse error %d\n", ret); -+ goto next; -+ } -+ acs_on_ids[max_acs_id].device = val; -+ max_acs_id++; -+ } -+next: -+ p += strcspn(p, ","); -+ if (*p == ',') -+ p++; -+ } -+ -+ if (acs_on_downstream || acs_on_multifunction || max_acs_id) -+ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); -+ -+ return 0; -+} -+early_param("pcie_acs_override", pcie_acs_override_setup); -+ -+static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) -+{ -+ int i; -+ -+ /* Never override ACS for legacy devices or devices with ACS caps */ -+ if (!pci_is_pcie(dev) || -+ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) -+ return -ENOTTY; -+ -+ for (i = 0; i < max_acs_id; i++) -+ if (acs_on_ids[i].vendor == dev->vendor && -+ acs_on_ids[i].device == dev->device) -+ return 1; -+ -+ switch (pci_pcie_type(dev)) { -+ case PCI_EXP_TYPE_DOWNSTREAM: -+ case PCI_EXP_TYPE_ROOT_PORT: -+ if (acs_on_downstream) -+ return 1; -+ break; -+ case PCI_EXP_TYPE_ENDPOINT: -+ case PCI_EXP_TYPE_UPSTREAM: -+ case PCI_EXP_TYPE_LEG_END: -+ case PCI_EXP_TYPE_RC_END: -+ if (acs_on_multifunction && dev->multifunction) -+ return 1; -+ } -+ -+ return -ENOTTY; -+} - /* - * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. 
- * The device will throw a Link Down error on AER-capable systems and -@@ -5102,6 +5102,7 @@ - { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, - /* Wangxun nics */ - { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, -+ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, - { 0 } - }; - - diff --git a/patches/sys-kernel/gentoo-sources/0007-v6.12-fsync_legacy_via_futex_waitv.patch b/patches/sys-kernel/gentoo-sources/0007-v6.12-fsync_legacy_via_futex_waitv.patch deleted file mode 100644 index fdda084..0000000 --- a/patches/sys-kernel/gentoo-sources/0007-v6.12-fsync_legacy_via_futex_waitv.patch +++ /dev/null @@ -1,166 +0,0 @@ -From b70e738f08403950aa3053c36b98c6b0eeb0eb90 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Andr=C3=A9=20Almeida?= -Date: Mon, 25 Oct 2021 09:49:42 -0300 -Subject: [PATCH] futex: Add entry point for FUTEX_WAIT_MULTIPLE (opcode 31) -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -Add an option to wait on multiple futexes using the old interface, that -uses opcode 31 through futex() syscall. Do that by just translation the -old interface to use the new code. This allows old and stable versions -of Proton to still use fsync in new kernel releases. - -Signed-off-by: André Almeida ---- - include/uapi/linux/futex.h | 13 +++++++ - kernel/futex/syscalls.c | 75 +++++++++++++++++++++++++++++++++++++- - 2 files changed, 87 insertions(+), 1 deletion(-) - -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index 71a5df8d2689..d375ab21cbf8 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -22,6 +22,7 @@ - #define FUTEX_WAIT_REQUEUE_PI 11 - #define FUTEX_CMP_REQUEUE_PI 12 - #define FUTEX_LOCK_PI2 13 -+#define FUTEX_WAIT_MULTIPLE 31 - - #define FUTEX_PRIVATE_FLAG 128 - #define FUTEX_CLOCK_REALTIME 256 -@@ -68,6 +69,18 @@ struct futex_waitv { - __u32 __reserved; - }; - -+/** -+ * struct futex_wait_block - Block of futexes to be waited for -+ * @uaddr: User address of the futex -+ * @val: Futex value expected by userspace -+ * @bitset: Bitset for the optional bitmasked wakeup -+ */ -+struct futex_wait_block { -+ __u32 __user *uaddr; -+ __u32 val; -+ __u32 bitset; -+}; -+ - /* - * Support for robust futexes: the kernel cleans up held futexes at - * thread exit time. -diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c -index 6f91a07a6a83..2f4d4c04ede2 100644 ---- a/kernel/futex/syscalls.c -+++ b/kernel/futex/syscalls.c -@@ -158,6 +158,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd) - case FUTEX_LOCK_PI2: - case FUTEX_WAIT_BITSET: - case FUTEX_WAIT_REQUEUE_PI: -+ case FUTEX_WAIT_MULTIPLE: - return true; - } - return false; -@@ -170,13 +171,79 @@ futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t) - return -EINVAL; - - *t = timespec64_to_ktime(*ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - *t = ktime_add_safe(ktime_get(), *t); - else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME)) - *t = timens_ktime_to_host(CLOCK_MONOTONIC, *t); - return 0; - } - -+/** -+ * futex_read_wait_block - Read an array of futex_wait_block from userspace -+ * @uaddr: Userspace address of the block -+ * @count: Number of blocks to be read -+ * -+ * This function creates and allocate an array of futex_q (we zero it to -+ * initialize the fields) and then, for each futex_wait_block element from -+ * userspace, fill a futex_q element with proper values. 
-+ */ -+inline struct futex_vector *futex_read_wait_block(u32 __user *uaddr, u32 count) -+{ -+ unsigned int i; -+ struct futex_vector *futexv; -+ struct futex_wait_block fwb; -+ struct futex_wait_block __user *entry = -+ (struct futex_wait_block __user *)uaddr; -+ -+ if (!count || count > FUTEX_WAITV_MAX) -+ return ERR_PTR(-EINVAL); -+ -+ futexv = kcalloc(count, sizeof(*futexv), GFP_KERNEL); -+ if (!futexv) -+ return ERR_PTR(-ENOMEM); -+ -+ for (i = 0; i < count; i++) { -+ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { -+ kfree(futexv); -+ return ERR_PTR(-EFAULT); -+ } -+ -+ futexv[i].w.flags = FUTEX_32; -+ futexv[i].w.val = fwb.val; -+ futexv[i].w.uaddr = (uintptr_t) (fwb.uaddr); -+ futexv[i].q = futex_q_init; -+ } -+ -+ return futexv; -+} -+ -+int futex_wait_multiple(struct futex_vector *vs, unsigned int count, -+ struct hrtimer_sleeper *to); -+ -+int futex_opcode_31(ktime_t *abs_time, u32 __user *uaddr, int count) -+{ -+ int ret; -+ struct futex_vector *vs; -+ struct hrtimer_sleeper *to = NULL, timeout; -+ -+ to = futex_setup_timer(abs_time, &timeout, 0, 0); -+ -+ vs = futex_read_wait_block(uaddr, count); -+ -+ if (IS_ERR(vs)) -+ return PTR_ERR(vs); -+ -+ ret = futex_wait_multiple(vs, count, abs_time ? to : NULL); -+ kfree(vs); -+ -+ if (to) { -+ hrtimer_cancel(&to->timer); -+ destroy_hrtimer_on_stack(&to->timer); -+ } -+ -+ return ret; -+} -+ - SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - const struct __kernel_timespec __user *, utime, - u32 __user *, uaddr2, u32, val3) -@@ -196,6 +263,9 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - tp = &t; - } - -+ if (cmd == FUTEX_WAIT_MULTIPLE) -+ return futex_opcode_31(tp, uaddr, val); -+ - return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); - } - -@@ -392,6 +462,9 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - tp = &t; - } - -+ if (cmd == FUTEX_WAIT_MULTIPLE) -+ return futex_opcode_31(tp, uaddr, val); -+ - return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3); - } - #endif /* CONFIG_COMPAT_32BIT_TIME */ --- -2.33.1 - diff --git a/patches/sys-kernel/gentoo-sources/0008-zstd.patch b/patches/sys-kernel/gentoo-sources/0008-zstd.patch deleted file mode 100644 index a32a7bc..0000000 --- a/patches/sys-kernel/gentoo-sources/0008-zstd.patch +++ /dev/null @@ -1,18652 +0,0 @@ -From 9342b92704267791d2b57a24f3a0db84c5f00912 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Mon, 7 Oct 2024 11:54:11 +0200 -Subject: [PATCH 8/8] zstd - -Signed-off-by: Peter Jung ---- - include/linux/zstd.h | 2 +- - include/linux/zstd_errors.h | 23 +- - include/linux/zstd_lib.h | 850 +++++-- - lib/zstd/Makefile | 2 +- - lib/zstd/common/allocations.h | 56 + - lib/zstd/common/bits.h | 149 ++ - lib/zstd/common/bitstream.h | 127 +- - lib/zstd/common/compiler.h | 134 +- - lib/zstd/common/cpu.h | 3 +- - lib/zstd/common/debug.c | 9 +- - lib/zstd/common/debug.h | 34 +- - lib/zstd/common/entropy_common.c | 42 +- - lib/zstd/common/error_private.c | 12 +- - lib/zstd/common/error_private.h | 84 +- - lib/zstd/common/fse.h | 94 +- - lib/zstd/common/fse_decompress.c | 130 +- - lib/zstd/common/huf.h | 237 +- - lib/zstd/common/mem.h | 3 +- - lib/zstd/common/portability_macros.h | 28 +- - lib/zstd/common/zstd_common.c | 38 +- - lib/zstd/common/zstd_deps.h | 16 +- - lib/zstd/common/zstd_internal.h | 109 +- - lib/zstd/compress/clevels.h | 3 +- - lib/zstd/compress/fse_compress.c | 74 +- - lib/zstd/compress/hist.c | 3 +- - lib/zstd/compress/hist.h | 3 +- - lib/zstd/compress/huf_compress.c | 
441 ++-- - lib/zstd/compress/zstd_compress.c | 2111 ++++++++++++----- - lib/zstd/compress/zstd_compress_internal.h | 359 ++- - lib/zstd/compress/zstd_compress_literals.c | 155 +- - lib/zstd/compress/zstd_compress_literals.h | 25 +- - lib/zstd/compress/zstd_compress_sequences.c | 7 +- - lib/zstd/compress/zstd_compress_sequences.h | 3 +- - lib/zstd/compress/zstd_compress_superblock.c | 376 ++- - lib/zstd/compress/zstd_compress_superblock.h | 3 +- - lib/zstd/compress/zstd_cwksp.h | 169 +- - lib/zstd/compress/zstd_double_fast.c | 143 +- - lib/zstd/compress/zstd_double_fast.h | 17 +- - lib/zstd/compress/zstd_fast.c | 596 +++-- - lib/zstd/compress/zstd_fast.h | 6 +- - lib/zstd/compress/zstd_lazy.c | 732 +++--- - lib/zstd/compress/zstd_lazy.h | 138 +- - lib/zstd/compress/zstd_ldm.c | 21 +- - lib/zstd/compress/zstd_ldm.h | 3 +- - lib/zstd/compress/zstd_ldm_geartab.h | 3 +- - lib/zstd/compress/zstd_opt.c | 497 ++-- - lib/zstd/compress/zstd_opt.h | 41 +- - lib/zstd/decompress/huf_decompress.c | 887 ++++--- - lib/zstd/decompress/zstd_ddict.c | 9 +- - lib/zstd/decompress/zstd_ddict.h | 3 +- - lib/zstd/decompress/zstd_decompress.c | 358 ++- - lib/zstd/decompress/zstd_decompress_block.c | 708 +++--- - lib/zstd/decompress/zstd_decompress_block.h | 10 +- - .../decompress/zstd_decompress_internal.h | 9 +- - lib/zstd/decompress_sources.h | 2 +- - lib/zstd/zstd_common_module.c | 5 +- - lib/zstd/zstd_compress_module.c | 2 +- - lib/zstd/zstd_decompress_module.c | 4 +- - 58 files changed, 6577 insertions(+), 3531 deletions(-) - create mode 100644 lib/zstd/common/allocations.h - create mode 100644 lib/zstd/common/bits.h - -diff --git a/include/linux/zstd.h b/include/linux/zstd.h -index b2c7cf310c8f..ac59ae9a18d7 100644 ---- a/include/linux/zstd.h -+++ b/include/linux/zstd.h -@@ -1,6 +1,6 @@ - /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h -index 58b6dd45a969..6d5cf55f0bf3 100644 ---- a/include/linux/zstd_errors.h -+++ b/include/linux/zstd_errors.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -17,8 +18,17 @@ - - - /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ --#define ZSTDERRORLIB_VISIBILITY --#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY -+#define ZSTDERRORLIB_VISIBLE -+ -+#ifndef ZSTDERRORLIB_HIDDEN -+# if (__GNUC__ >= 4) && !defined(__MINGW32__) -+# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) -+# else -+# define ZSTDERRORLIB_HIDDEN -+# endif -+#endif -+ -+#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE - - /*-********************************************* - * Error codes list -@@ -43,14 +53,17 @@ typedef enum { - ZSTD_error_frameParameter_windowTooLarge = 16, - ZSTD_error_corruption_detected = 20, - ZSTD_error_checksum_wrong = 22, -+ ZSTD_error_literals_headerWrong = 24, - ZSTD_error_dictionary_corrupted = 30, - ZSTD_error_dictionary_wrong = 32, - ZSTD_error_dictionaryCreation_failed = 34, - ZSTD_error_parameter_unsupported = 40, -+ ZSTD_error_parameter_combination_unsupported = 41, - ZSTD_error_parameter_outOfBound = 42, - ZSTD_error_tableLog_tooLarge = 44, - ZSTD_error_maxSymbolValue_tooLarge = 46, - ZSTD_error_maxSymbolValue_tooSmall = 48, -+ ZSTD_error_stabilityCondition_notRespected = 50, - ZSTD_error_stage_wrong = 60, - ZSTD_error_init_missing = 62, - ZSTD_error_memory_allocation = 64, -@@ -58,11 +71,15 @@ typedef enum { - ZSTD_error_dstSize_tooSmall = 70, - ZSTD_error_srcSize_wrong = 72, - ZSTD_error_dstBuffer_null = 74, -+ ZSTD_error_noForwardProgress_destFull = 80, -+ ZSTD_error_noForwardProgress_inputEmpty = 82, - /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ - ZSTD_error_frameIndex_tooLarge = 100, - ZSTD_error_seekableIO = 102, - ZSTD_error_dstBuffer_wrong = 104, - ZSTD_error_srcBuffer_wrong = 105, -+ ZSTD_error_sequenceProducer_failed = 106, -+ ZSTD_error_externalSequences_invalid = 107, - ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ - } ZSTD_ErrorCode; - -diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h -index 79d55465d5c1..6320fedcf8a4 100644 ---- a/include/linux/zstd_lib.h -+++ b/include/linux/zstd_lib.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -11,23 +12,42 @@ - #ifndef ZSTD_H_235446 - #define ZSTD_H_235446 - --/* ====== Dependency ======*/ -+/* ====== Dependencies ======*/ - #include /* INT_MAX */ - #include /* size_t */ - - - /* ===== ZSTDLIB_API : control library symbols visibility ===== */ --#ifndef ZSTDLIB_VISIBLE -+#define ZSTDLIB_VISIBLE -+ -+#ifndef ZSTDLIB_HIDDEN - # if (__GNUC__ >= 4) && !defined(__MINGW32__) --# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) - # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) - # else --# define ZSTDLIB_VISIBLE - # define ZSTDLIB_HIDDEN - # endif - #endif -+ - #define ZSTDLIB_API ZSTDLIB_VISIBLE - -+/* Deprecation warnings : -+ * Should these warnings be a problem, it is generally possible to disable them, -+ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. -+ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. 
-+ */ -+#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS -+# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ -+#else -+# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) -+# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) -+# elif (__GNUC__ >= 3) -+# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) -+# else -+# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") -+# define ZSTD_DEPRECATED(message) -+# endif -+#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ -+ - - /* ***************************************************************************** - Introduction -@@ -65,7 +85,7 @@ - /*------ Version ------*/ - #define ZSTD_VERSION_MAJOR 1 - #define ZSTD_VERSION_MINOR 5 --#define ZSTD_VERSION_RELEASE 2 -+#define ZSTD_VERSION_RELEASE 6 - #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) - - /*! ZSTD_versionNumber() : -@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void); - ***************************************/ - /*! ZSTD_compress() : - * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. -- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. -+ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have -+ * enough space to successfully compress the data. - * @return : compressed size written into `dst` (<= `dstCapacity), - * or an error code if it fails (which can be tested using ZSTD_isError()). */ - ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, -@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t - * "empty", "unknown" and "error" results to the same return value (0), - * while ZSTD_getFrameContentSize() gives them separate return values. - * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ --ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); -+ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") -+ZSTDLIB_API -+unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); - - /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ - * `src` should point to the start of a ZSTD frame or skippable frame. -@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) - - - /*====== Helper functions ======*/ --#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ --ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ -+/* ZSTD_compressBound() : -+ * maximum compressed size in worst case single-pass scenario. -+ * When invoking `ZSTD_compress()` or any other one-pass compression function, -+ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) -+ * as it eliminates one potential failure scenario, -+ * aka not enough room in dst buffer to write the compressed frame. -+ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE . -+ * In which case, ZSTD_compressBound() will return an error code -+ * which can be tested using ZSTD_isError(). 
-+ * -+ * ZSTD_COMPRESSBOUND() : -+ * same as ZSTD_compressBound(), but as a macro. -+ * It can be used to produce constants, which can be useful for static allocation, -+ * for example to size a static array on stack. -+ * Will produce constant value 0 if srcSize too large. -+ */ -+#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U) -+#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ -+ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ -+/* ZSTD_isError() : -+ * Most ZSTD_* functions returning a size_t value can be tested for error, -+ * using ZSTD_isError(). -+ * @return 1 if error, 0 otherwise -+ */ - ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ - ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ - ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ -@@ -183,7 +228,7 @@ ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compres - /*= Compression context - * When compressing many times, - * it is recommended to allocate a context just once, -- * and re-use it for each successive compression operation. -+ * and reuse it for each successive compression operation. - * This will make workload friendlier for system's memory. - * Note : re-using context is just a speed / resource optimization. - * It doesn't change the compression ratio, which remains identical. -@@ -196,9 +241,9 @@ ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer * - - /*! ZSTD_compressCCtx() : - * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. -- * Important : in order to behave similarly to `ZSTD_compress()`, -- * this function compresses at requested compression level, -- * __ignoring any other parameter__ . -+ * Important : in order to mirror `ZSTD_compress()` behavior, -+ * this function compresses at the requested compression level, -+ * __ignoring any other advanced parameter__ . - * If any advanced parameter was set using the advanced API, - * they will all be reset. Only `compressionLevel` remains. - */ -@@ -210,7 +255,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, - /*= Decompression context - * When decompressing many times, - * it is recommended to allocate a context only once, -- * and re-use it for each successive compression operation. -+ * and reuse it for each successive compression operation. - * This will make workload friendlier for system's memory. - * Use one context per thread for parallel execution. */ - typedef struct ZSTD_DCtx_s ZSTD_DCtx; -@@ -220,7 +265,7 @@ ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer * - /*! ZSTD_decompressDCtx() : - * Same as ZSTD_decompress(), - * requires an allocated ZSTD_DCtx. -- * Compatible with sticky parameters. -+ * Compatible with sticky parameters (see below). - */ - ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, - void* dst, size_t dstCapacity, -@@ -236,12 +281,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, - * using ZSTD_CCtx_set*() functions. 
- * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. - * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! -- * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . -+ * __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ . - * - * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). - * - * This API supersedes all other "advanced" API entry points in the experimental section. -- * In the future, we expect to remove from experimental API entry points which are redundant with this API. -+ * In the future, we expect to remove API entry points from experimental which are redundant with this API. - */ - - -@@ -324,6 +369,19 @@ typedef enum { - * The higher the value of selected strategy, the more complex it is, - * resulting in stronger and slower compression. - * Special: value 0 means "use default strategy". */ -+ -+ ZSTD_c_targetCBlockSize=130, /* v1.5.6+ -+ * Attempts to fit compressed block size into approximatively targetCBlockSize. -+ * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX. -+ * Note that it's not a guarantee, just a convergence target (default:0). -+ * No target when targetCBlockSize == 0. -+ * This is helpful in low bandwidth streaming environments to improve end-to-end latency, -+ * when a client can make use of partial documents (a prominent example being Chrome). -+ * Note: this parameter is stable since v1.5.6. -+ * It was present as an experimental parameter in earlier versions, -+ * but it's not recommended using it with earlier library versions -+ * due to massive performance regressions. -+ */ - /* LDM mode parameters */ - ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. - * This parameter is designed to improve compression ratio -@@ -403,7 +461,6 @@ typedef enum { - * ZSTD_c_forceMaxWindow - * ZSTD_c_forceAttachDict - * ZSTD_c_literalCompressionMode -- * ZSTD_c_targetCBlockSize - * ZSTD_c_srcSizeHint - * ZSTD_c_enableDedicatedDictSearch - * ZSTD_c_stableInBuffer -@@ -412,6 +469,9 @@ typedef enum { - * ZSTD_c_validateSequences - * ZSTD_c_useBlockSplitter - * ZSTD_c_useRowMatchFinder -+ * ZSTD_c_prefetchCDictTables -+ * ZSTD_c_enableSeqProducerFallback -+ * ZSTD_c_maxBlockSize - * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. - * note : never ever use experimentalParam? names directly; - * also, the enums values themselves are unstable and can still change. -@@ -421,7 +481,7 @@ typedef enum { - ZSTD_c_experimentalParam3=1000, - ZSTD_c_experimentalParam4=1001, - ZSTD_c_experimentalParam5=1002, -- ZSTD_c_experimentalParam6=1003, -+ /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */ - ZSTD_c_experimentalParam7=1004, - ZSTD_c_experimentalParam8=1005, - ZSTD_c_experimentalParam9=1006, -@@ -430,7 +490,11 @@ typedef enum { - ZSTD_c_experimentalParam12=1009, - ZSTD_c_experimentalParam13=1010, - ZSTD_c_experimentalParam14=1011, -- ZSTD_c_experimentalParam15=1012 -+ ZSTD_c_experimentalParam15=1012, -+ ZSTD_c_experimentalParam16=1013, -+ ZSTD_c_experimentalParam17=1014, -+ ZSTD_c_experimentalParam18=1015, -+ ZSTD_c_experimentalParam19=1016 - } ZSTD_cParameter; - - typedef struct { -@@ -493,7 +557,7 @@ typedef enum { - * They will be used to compress next frame. - * Resetting session never fails. - * - The parameters : changes all parameters back to "default". 
-- * This removes any reference to any dictionary too. -+ * This also removes any reference to any dictionary or external sequence producer. - * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) - * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) - * - Both : similar to resetting the session, followed by resetting parameters. -@@ -502,11 +566,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); - - /*! ZSTD_compress2() : - * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. -+ * (note that this entry point doesn't even expose a compression level parameter). - * ZSTD_compress2() always starts a new frame. - * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. - * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() - * - The function is always blocking, returns when compression is completed. -- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. -+ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have -+ * enough space to successfully compress the data, though it is possible it fails for other reasons. - * @return : compressed size written into `dst` (<= `dstCapacity), - * or an error code if it fails (which can be tested using ZSTD_isError()). - */ -@@ -543,13 +609,17 @@ typedef enum { - * ZSTD_d_stableOutBuffer - * ZSTD_d_forceIgnoreChecksum - * ZSTD_d_refMultipleDDicts -+ * ZSTD_d_disableHuffmanAssembly -+ * ZSTD_d_maxBlockSize - * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. - * note : never ever use experimentalParam? names directly - */ - ZSTD_d_experimentalParam1=1000, - ZSTD_d_experimentalParam2=1001, - ZSTD_d_experimentalParam3=1002, -- ZSTD_d_experimentalParam4=1003 -+ ZSTD_d_experimentalParam4=1003, -+ ZSTD_d_experimentalParam5=1004, -+ ZSTD_d_experimentalParam6=1005 - - } ZSTD_dParameter; - -@@ -604,14 +674,14 @@ typedef struct ZSTD_outBuffer_s { - * A ZSTD_CStream object is required to track streaming operation. - * Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. - * ZSTD_CStream objects can be reused multiple times on consecutive compression operations. --* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. -+* It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. - * - * For parallel execution, use one separate ZSTD_CStream per thread. - * - * note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. - * - * Parameters are sticky : when starting a new compression on the same context, --* it will re-use the same sticky parameters as previous compression session. -+* it will reuse the same sticky parameters as previous compression session. - * When in doubt, it's recommended to fully initialize the context before usage. - * Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), - * ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to -@@ -700,6 +770,11 @@ typedef enum { - * only ZSTD_e_end or ZSTD_e_flush operations are allowed. - * Before starting a new compression job, or changing compression parameters, - * it is required to fully flush internal buffers. 
-+ * - note: if an operation ends with an error, it may leave @cctx in an undefined state. -+ * Therefore, it's UB to invoke ZSTD_compressStream2() of ZSTD_compressStream() on such a state. -+ * In order to be re-employed after an error, a state must be reset, -+ * which can be done explicitly (ZSTD_CCtx_reset()), -+ * or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx()) - */ - ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, - ZSTD_outBuffer* output, -@@ -728,8 +803,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output - * This following is a legacy streaming API, available since v1.0+ . - * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). - * It is redundant, but remains fully supported. -- * Streaming in combination with advanced parameters and dictionary compression -- * can only be used through the new API. - ******************************************************************************/ - - /*! -@@ -738,6 +811,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output - * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); - * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) - * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); -+ * -+ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API -+ * to compress with a dictionary. - */ - ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); - /*! -@@ -758,7 +834,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); - * - * A ZSTD_DStream object is required to track streaming operations. - * Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. --* ZSTD_DStream objects can be re-used multiple times. -+* ZSTD_DStream objects can be reused multiple times. - * - * Use ZSTD_initDStream() to start a new decompression operation. - * @return : recommended first input size -@@ -788,13 +864,37 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer - - /*===== Streaming decompression functions =====*/ - --/* This function is redundant with the advanced API and equivalent to: -+/*! ZSTD_initDStream() : -+ * Initialize/reset DStream state for new decompression operation. -+ * Call before new decompression operation using same DStream. - * -+ * Note : This function is redundant with the advanced API and equivalent to: - * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); - * ZSTD_DCtx_refDDict(zds, NULL); - */ - ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); - -+/*! ZSTD_decompressStream() : -+ * Streaming decompression function. -+ * Call repetitively to consume full input updating it as necessary. -+ * Function will update both input and output `pos` fields exposing current state via these fields: -+ * - `input.pos < input.size`, some input remaining and caller should provide remaining input -+ * on the next call. -+ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers. -+ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers, -+ * call ZSTD_decompressStream() again to flush remaining data to output. -+ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. 
-+ * -+ * @return : 0 when a frame is completely decoded and fully flushed, -+ * or an error code, which can be tested using ZSTD_isError(), -+ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. -+ * -+ * Note: when an operation returns with an error code, the @zds state may be left in undefined state. -+ * It's UB to invoke `ZSTD_decompressStream()` on such a state. -+ * In order to re-use such a state, it must be first reset, -+ * which can be done explicitly (`ZSTD_DCtx_reset()`), -+ * or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`) -+ */ - ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); - - ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ -@@ -913,7 +1013,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); - * If @return == 0, the dictID could not be decoded. - * This could for one of the following reasons : - * - The frame does not require a dictionary to be decoded (most common case). -- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. -+ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. - * Note : this use case also happens when using a non-conformant dictionary. - * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). - * - This is not a Zstandard frame. -@@ -925,9 +1025,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); - * Advanced dictionary and prefix API (Requires v1.4.0+) - * - * This API allows dictionaries to be used with ZSTD_compress2(), -- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and -- * only reset with the context is reset with ZSTD_reset_parameters or -- * ZSTD_reset_session_and_parameters. Prefixes are single-use. -+ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). -+ * Dictionaries are sticky, they remain valid when same context is reused, -+ * they only reset when the context is reset -+ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. -+ * In contrast, Prefixes are single-use. - ******************************************************************************/ - - -@@ -937,8 +1039,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); - * @result : 0, or an error code (which can be tested with ZSTD_isError()). - * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, - * meaning "return to no-dictionary mode". -- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. -- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). -+ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, -+ * until parameters are reset, a new dictionary is loaded, or the dictionary -+ * is explicitly invalidated by loading a NULL dictionary. - * Note 2 : Loading a dictionary involves building tables. - * It's also a CPU consuming operation, with non-negligible impact on latency. 
- * Tables are dependent on compression parameters, and for this reason, -@@ -947,11 +1050,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); - * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. - * In such a case, dictionary buffer must outlive its users. - * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() -- * to precisely select how dictionary content must be interpreted. */ -+ * to precisely select how dictionary content must be interpreted. -+ * Note 5 : This method does not benefit from LDM (long distance mode). -+ * If you want to employ LDM on some large dictionary content, -+ * prefer employing ZSTD_CCtx_refPrefix() described below. -+ */ - ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); - - /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ -- * Reference a prepared dictionary, to be used for all next compressed frames. -+ * Reference a prepared dictionary, to be used for all future compressed frames. - * Note that compression parameters are enforced from within CDict, - * and supersede any compression parameter previously set within CCtx. - * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. -@@ -970,6 +1077,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); - * Decompression will need same prefix to properly regenerate data. - * Compressing with a prefix is similar in outcome as performing a diff and compressing it, - * but performs much faster, especially during decompression (compression speed is tunable with compression level). -+ * This method is compatible with LDM (long distance mode). - * @result : 0, or an error code (which can be tested with ZSTD_isError()). - * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary - * Note 1 : Prefix buffer is referenced. It **must** outlive compression. -@@ -986,9 +1094,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, - const void* prefix, size_t prefixSize); - - /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ -- * Create an internal DDict from dict buffer, -- * to be used to decompress next frames. -- * The dictionary remains valid for all future frames, until explicitly invalidated. -+ * Create an internal DDict from dict buffer, to be used to decompress all future frames. -+ * The dictionary remains valid for all future frames, until explicitly invalidated, or -+ * a new dictionary is loaded. - * @result : 0, or an error code (which can be tested with ZSTD_isError()). - * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, - * meaning "return to no-dictionary mode". -@@ -1012,9 +1120,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s - * The memory for the table is allocated on the first call to refDDict, and can be - * freed with ZSTD_freeDCtx(). - * -+ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary -+ * will be managed, and referencing a dictionary effectively "discards" any previous one. -+ * - * @result : 0, or an error code (which can be tested with ZSTD_isError()). -- * Note 1 : Currently, only one dictionary can be managed. -- * Referencing a new dictionary effectively "discards" any previous one. - * Special: referencing a NULL DDict means "return to no-dictionary mode". - * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. 
- */ -@@ -1071,24 +1180,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); - #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE - #endif - --/* Deprecation warnings : -- * Should these warnings be a problem, it is generally possible to disable them, -- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. -- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. -- */ --#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS --# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ --#else --# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) --# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) --# elif (__GNUC__ >= 3) --# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) --# else --# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") --# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API --# endif --#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ -- - /* ************************************************************************************** - * experimental API (static linking only) - **************************************************************************************** -@@ -1123,6 +1214,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); - #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ - #define ZSTD_STRATEGY_MIN ZSTD_fast - #define ZSTD_STRATEGY_MAX ZSTD_btultra2 -+#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ - - - #define ZSTD_OVERLAPLOG_MIN 0 -@@ -1146,7 +1238,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); - #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN) - - /* Advanced parameter bounds */ --#define ZSTD_TARGETCBLOCKSIZE_MIN 64 -+#define ZSTD_TARGETCBLOCKSIZE_MIN 1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */ - #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX - #define ZSTD_SRCSIZEHINT_MIN 0 - #define ZSTD_SRCSIZEHINT_MAX INT_MAX -@@ -1303,7 +1395,7 @@ typedef enum { - } ZSTD_paramSwitch_e; - - /* ************************************* --* Frame size functions -+* Frame header and size functions - ***************************************/ - - /*! ZSTD_findDecompressedSize() : -@@ -1350,29 +1442,122 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size - * or an error code (if srcSize is too small) */ - ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); - -+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; -+typedef struct { -+ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ -+ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ -+ unsigned blockSizeMax; -+ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ -+ unsigned headerSize; -+ unsigned dictID; -+ unsigned checksumFlag; -+ unsigned _reserved1; -+ unsigned _reserved2; -+} ZSTD_frameHeader; -+ -+/*! ZSTD_getFrameHeader() : -+ * decode Frame Header, or requires larger `srcSize`. 
-+ * @return : 0, `zfhPtr` is correctly filled, -+ * >0, `srcSize` is too small, value is wanted `srcSize` amount, -+ * or an error code, which can be tested using ZSTD_isError() */ -+ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ -+/*! ZSTD_getFrameHeader_advanced() : -+ * same as ZSTD_getFrameHeader(), -+ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ -+ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); -+ -+/*! ZSTD_decompressionMargin() : -+ * Zstd supports in-place decompression, where the input and output buffers overlap. -+ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, -+ * and the input buffer must be at the end of the output buffer. -+ * -+ * _______________________ Output Buffer ________________________ -+ * | | -+ * | ____ Input Buffer ____| -+ * | | | -+ * v v v -+ * |---------------------------------------|-----------|----------| -+ * ^ ^ ^ -+ * |___________________ Output_Size ___________________|_ Margin _| -+ * -+ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). -+ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or -+ * ZSTD_decompressDCtx(). -+ * NOTE: This function supports multi-frame input. -+ * -+ * @param src The compressed frame(s) -+ * @param srcSize The size of the compressed frame(s) -+ * @returns The decompression margin or an error that can be checked with ZSTD_isError(). -+ */ -+ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); -+ -+/*! ZSTD_DECOMPRESS_MARGIN() : -+ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from -+ * the compressed frame, compute it from the original size and the blockSizeLog. -+ * See ZSTD_decompressionMargin() for details. -+ * -+ * WARNING: This macro does not support multi-frame input, the input must be a single -+ * zstd frame. If you need that support use the function, or implement it yourself. -+ * -+ * @param originalSize The original uncompressed size of the data. -+ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). -+ * Unless you explicitly set the windowLog smaller than -+ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. -+ */ -+#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ -+ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ -+ 4 /* checksum */ + \ -+ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ -+ (blockSize) /* One block of margin */ \ -+ )) -+ - typedef enum { - ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ - ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ - } ZSTD_sequenceFormat_e; - -+/*! ZSTD_sequenceBound() : -+ * `srcSize` : size of the input buffer -+ * @return : upper-bound for the number of sequences that can be generated -+ * from a buffer of srcSize bytes -+ * -+ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). -+ */ -+ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); -+ - /*! ZSTD_generateSequences() : -- * Generate sequences using ZSTD_compress2, given a source buffer. -+ * WARNING: This function is meant for debugging and informational purposes ONLY! 
-+ * Its implementation is flawed, and it will be deleted in a future version. -+ * It is not guaranteed to succeed, as there are several cases where it will give -+ * up and fail. You should NOT use this function in production code. -+ * -+ * This function is deprecated, and will be removed in a future version. -+ * -+ * Generate sequences using ZSTD_compress2(), given a source buffer. -+ * -+ * @param zc The compression context to be used for ZSTD_compress2(). Set any -+ * compression parameters you need on this context. -+ * @param outSeqs The output sequences buffer of size @p outSeqsSize -+ * @param outSeqsSize The size of the output sequences buffer. -+ * ZSTD_sequenceBound(srcSize) is an upper bound on the number -+ * of sequences that can be generated. -+ * @param src The source buffer to generate sequences from of size @p srcSize. -+ * @param srcSize The size of the source buffer. - * - * Each block will end with a dummy sequence - * with offset == 0, matchLength == 0, and litLength == length of last literals. - * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) - * simply acts as a block delimiter. - * -- * zc can be used to insert custom compression params. -- * This function invokes ZSTD_compress2 -- * -- * The output of this function can be fed into ZSTD_compressSequences() with CCtx -- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters -- * @return : number of sequences generated -+ * @returns The number of sequences generated, necessarily less than -+ * ZSTD_sequenceBound(srcSize), or an error code that can be checked -+ * with ZSTD_isError(). - */ -- --ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, -- size_t outSeqsSize, const void* src, size_t srcSize); -+ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()") -+ZSTDLIB_STATIC_API size_t -+ZSTD_generateSequences(ZSTD_CCtx* zc, -+ ZSTD_Sequence* outSeqs, size_t outSeqsSize, -+ const void* src, size_t srcSize); - - /*! ZSTD_mergeBlockDelimiters() : - * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals -@@ -1388,7 +1573,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o - ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); - - /*! ZSTD_compressSequences() : -- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. -+ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. -+ * @src contains the entire input (not just the literals). -+ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals - * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) - * The entire source is compressed into a single frame. - * -@@ -1413,11 +1600,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si - * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. - * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, - * and cannot emit an RLE block that disagrees with the repcode history -- * @return : final compressed size or a ZSTD error. -+ * @return : final compressed size, or a ZSTD error code. 
- */ --ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, -- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, -- const void* src, size_t srcSize); -+ZSTDLIB_STATIC_API size_t -+ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, -+ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, -+ const void* src, size_t srcSize); - - - /*! ZSTD_writeSkippableFrame() : -@@ -1464,48 +1652,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); - /*! ZSTD_estimate*() : - * These functions make it possible to estimate memory usage - * of a future {D,C}Ctx, before its creation. -+ * This is useful in combination with ZSTD_initStatic(), -+ * which makes it possible to employ a static buffer for ZSTD_CCtx* state. - * - * ZSTD_estimateCCtxSize() will provide a memory budget large enough -- * for any compression level up to selected one. -- * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate -- * does not include space for a window buffer. -- * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. -+ * to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2() -+ * associated with any compression level up to max specified one. - * The estimate will assume the input may be arbitrarily large, - * which is the worst case. - * -+ * Note that the size estimation is specific for one-shot compression, -+ * it is not valid for streaming (see ZSTD_estimateCStreamSize*()) -+ * nor other potential ways of using a ZSTD_CCtx* state. -+ * - * When srcSize can be bound by a known and rather "small" value, -- * this fact can be used to provide a tighter estimation -- * because the CCtx compression context will need less memory. -- * This tighter estimation can be provided by more advanced functions -+ * this knowledge can be used to provide a tighter budget estimation -+ * because the ZSTD_CCtx* state will need less memory for small inputs. -+ * This tighter estimation can be provided by employing more advanced functions - * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), - * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). - * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. - * -- * Note 2 : only single-threaded compression is supported. -+ * Note : only single-threaded compression is supported. - * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. - */ --ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); -+ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel); - ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); - ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); - ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); - - /*! ZSTD_estimateCStreamSize() : -- * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. -- * It will also consider src size to be arbitrarily "large", which is worst case. -+ * ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression -+ * using any compression level up to the max specified one. -+ * It will also consider src size to be arbitrarily "large", which is a worst case scenario. 
- * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. - * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. - * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. - * Note : CStream size estimation is only correct for single-threaded compression. -- * ZSTD_DStream memory budget depends on window Size. -+ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. -+ * Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. -+ * Size estimates assume that no external sequence producer is registered. -+ * -+ * ZSTD_DStream memory budget depends on frame's window Size. - * This information can be passed manually, using ZSTD_estimateDStreamSize, - * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); -+ * Any frame requesting a window size larger than max specified one will be rejected. - * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), - * an internal ?Dict will be created, which additional size is not estimated here. -- * In this case, get total size by adding ZSTD_estimate?DictSize */ --ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); -+ * In this case, get total size by adding ZSTD_estimate?DictSize -+ */ -+ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel); - ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); - ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); --ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); -+ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize); - ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); - - /*! ZSTD_estimate?DictSize() : -@@ -1649,22 +1848,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); - * This function never fails (wide contract) */ - ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); - -+/*! ZSTD_CCtx_setCParams() : -+ * Set all parameters provided within @p cparams into the working @p cctx. -+ * Note : if modifying parameters during compression (MT mode only), -+ * note that changes to the .windowLog parameter will be ignored. -+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). -+ * On failure, no parameters are updated. -+ */ -+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); -+ -+/*! ZSTD_CCtx_setFParams() : -+ * Set all parameters provided within @p fparams into the working @p cctx. -+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). -+ */ -+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); -+ -+/*! ZSTD_CCtx_setParams() : -+ * Set all parameters provided within @p params into the working @p cctx. -+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). 
-+ */ -+ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); -+ - /*! ZSTD_compress_advanced() : - * Note : this function is now DEPRECATED. - * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. - * This prototype will generate compilation warnings. */ - ZSTD_DEPRECATED("use ZSTD_compress2") -+ZSTDLIB_STATIC_API - size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, -- void* dst, size_t dstCapacity, -- const void* src, size_t srcSize, -- const void* dict,size_t dictSize, -- ZSTD_parameters params); -+ void* dst, size_t dstCapacity, -+ const void* src, size_t srcSize, -+ const void* dict,size_t dictSize, -+ ZSTD_parameters params); - - /*! ZSTD_compress_usingCDict_advanced() : - * Note : this function is now DEPRECATED. - * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. - * This prototype will generate compilation warnings. */ - ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") -+ZSTDLIB_STATIC_API - size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, -@@ -1737,11 +1959,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo - */ - #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 - --/* Tries to fit compressed block size to be around targetCBlockSize. -- * No target when targetCBlockSize == 0. -- * There is no guarantee on compressed block size (default:0) */ --#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 -- - /* User's best guess of source size. - * Hint is not valid when srcSizeHint == 0. - * There is no guarantee that hint is close to actual source size, -@@ -1808,13 +2025,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo - * Experimental parameter. - * Default is 0 == disabled. Set to 1 to enable. - * -- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same -- * between calls, except for the modifications that zstd makes to pos (the -- * caller must not modify pos). This is checked by the compressor, and -- * compression will fail if it ever changes. This means the only flush -- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end -- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) -- * MUST not be modified during compression or you will get data corruption. -+ * Tells the compressor that input data presented with ZSTD_inBuffer -+ * will ALWAYS be the same between calls. -+ * Technically, the @src pointer must never be changed, -+ * and the @pos field can only be updated by zstd. -+ * However, it's possible to increase the @size field, -+ * allowing scenarios where more data can be appended after compressions starts. -+ * These conditions are checked by the compressor, -+ * and compression will fail if they are not respected. -+ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) -+ * MUST not be modified during compression or it will result in data corruption. - * - * When this flag is enabled zstd won't allocate an input window buffer, - * because the user guarantees it can reference the ZSTD_inBuffer until -@@ -1822,18 +2042,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo - * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also - * avoid the memcpy() from the input buffer to the input window buffer. 
- * -- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. -- * That means this flag cannot be used with ZSTD_compressStream(). -- * - * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using - * this flag is ALWAYS memory safe, and will never access out-of-bounds -- * memory. However, compression WILL fail if you violate the preconditions. -+ * memory. However, compression WILL fail if conditions are not respected. - * -- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST -- * not be modified during compression or you will get data corruption. This -- * is because zstd needs to reference data in the ZSTD_inBuffer to find -+ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST -+ * not be modified during compression or it will result in data corruption. -+ * This is because zstd needs to reference data in the ZSTD_inBuffer to find - * matches. Normally zstd maintains its own window buffer for this purpose, -- * but passing this flag tells zstd to use the user provided buffer. -+ * but passing this flag tells zstd to rely on user provided buffer instead. - */ - #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 - -@@ -1878,7 +2095,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo - * Without validation, providing a sequence that does not conform to the zstd spec will cause - * undefined behavior, and may produce a corrupted block. - * -- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for -+ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for - * specifics regarding offset/matchlength requirements) then the function will bail out and - * return an error. - * -@@ -1928,6 +2145,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo - */ - #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 - -+/* ZSTD_c_prefetchCDictTables -+ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto. -+ * -+ * In some situations, zstd uses CDict tables in-place rather than copying them -+ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). -+ * In such situations, compression speed is seriously impacted when CDict tables are -+ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables -+ * when they are used in-place. -+ * -+ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. -+ * For sufficiently large inputs, zstd will by default memcpy() CDict tables -+ * into the working context, so there is no need to prefetch. This parameter is -+ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be -+ * useful but memcpy() is too expensive. The exact range of input sizes where this -+ * makes sense is best determined by careful experimentation. -+ * -+ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, -+ * but in the future zstd may conditionally enable this feature via an auto-detection -+ * heuristic for cold CDicts. -+ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. -+ */ -+#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 -+ -+/* ZSTD_c_enableSeqProducerFallback -+ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. 
-+ * -+ * Controls whether zstd will fall back to an internal sequence producer if an -+ * external sequence producer is registered and returns an error code. This fallback -+ * is block-by-block: the internal sequence producer will only be called for blocks -+ * where the external sequence producer returns an error code. Fallback parsing will -+ * follow any other cParam settings, such as compression level, the same as in a -+ * normal (fully-internal) compression operation. -+ * -+ * The user is strongly encouraged to read the full Block-Level Sequence Producer API -+ * documentation (below) before setting this parameter. */ -+#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 -+ -+/* ZSTD_c_maxBlockSize -+ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). -+ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. -+ * -+ * This parameter can be used to set an upper bound on the blocksize -+ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper -+ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make -+ * compressBound() inaccurate). Only currently meant to be used for testing. -+ * -+ */ -+#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 -+ -+/* ZSTD_c_searchForExternalRepcodes -+ * This parameter affects how zstd parses external sequences, such as sequences -+ * provided through the compressSequences() API or from an external block-level -+ * sequence producer. -+ * -+ * If set to ZSTD_ps_enable, the library will check for repeated offsets in -+ * external sequences, even if those repcodes are not explicitly indicated in -+ * the "rep" field. Note that this is the only way to exploit repcode matches -+ * while using compressSequences() or an external sequence producer, since zstd -+ * currently ignores the "rep" field of external sequences. -+ * -+ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in -+ * external sequences, regardless of whether the "rep" field has been set. This -+ * reduces sequence compression overhead by about 25% while sacrificing some -+ * compression ratio. -+ * -+ * The default value is ZSTD_ps_auto, for which the library will enable/disable -+ * based on compression level. -+ * -+ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is -+ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future. -+ */ -+#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 -+ - /*! ZSTD_CCtx_getParameter() : - * Get the requested compression parameter value, selected by enum ZSTD_cParameter, - * and store it into int* value. -@@ -2084,7 +2374,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete - * in the range [dst, dst + pos) MUST not be modified during decompression - * or you will get data corruption. - * -- * When this flags is enabled zstd won't allocate an output buffer, because -+ * When this flag is enabled zstd won't allocate an output buffer, because - * it can write directly to the ZSTD_outBuffer, but it will still allocate - * an input buffer large enough to fit any compressed block. This will also - * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. -@@ -2137,6 +2427,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete - */ - #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 - -+/* ZSTD_d_disableHuffmanAssembly -+ * Set to 1 to disable the Huffman assembly implementation. 
-+ * The default value is 0, which allows zstd to use the Huffman assembly -+ * implementation if available. -+ * -+ * This parameter can be used to disable Huffman assembly at runtime. -+ * If you want to disable it at compile time you can define the macro -+ * ZSTD_DISABLE_ASM. -+ */ -+#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 -+ -+/* ZSTD_d_maxBlockSize -+ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). -+ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. -+ * -+ * Forces the decompressor to reject blocks whose content size is -+ * larger than the configured maxBlockSize. When maxBlockSize is -+ * larger than the windowSize, the windowSize is used instead. -+ * This saves memory on the decoder when you know all blocks are small. -+ * -+ * This option is typically used in conjunction with ZSTD_c_maxBlockSize. -+ * -+ * WARNING: This causes the decoder to reject otherwise valid frames -+ * that have block sizes larger than the configured maxBlockSize. -+ */ -+#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6 -+ - - /*! ZSTD_DCtx_setFormat() : - * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). -@@ -2145,6 +2462,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete - * such ZSTD_f_zstd1_magicless for example. - * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ - ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") -+ZSTDLIB_STATIC_API - size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); - - /*! ZSTD_decompressStream_simpleArgs() : -@@ -2181,6 +2499,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( - * This prototype will generate compilation warnings. - */ - ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") -+ZSTDLIB_STATIC_API - size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, - int compressionLevel, - unsigned long long pledgedSrcSize); -@@ -2198,17 +2517,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, - * This prototype will generate compilation warnings. - */ - ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") -+ZSTDLIB_STATIC_API - size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, - const void* dict, size_t dictSize, - int compressionLevel); - - /*! ZSTD_initCStream_advanced() : -- * This function is DEPRECATED, and is approximately equivalent to: -+ * This function is DEPRECATED, and is equivalent to: - * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); -- * // Pseudocode: Set each zstd parameter and leave the rest as-is. -- * for ((param, value) : params) { -- * ZSTD_CCtx_setParameter(zcs, param, value); -- * } -+ * ZSTD_CCtx_setParams(zcs, params); - * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); - * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); - * -@@ -2218,6 +2535,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, - * This prototype will generate compilation warnings. - */ - ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") -+ZSTDLIB_STATIC_API - size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, - const void* dict, size_t dictSize, - ZSTD_parameters params, -@@ -2232,15 +2550,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, - * This prototype will generate compilation warnings. 
- */ - ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") -+ZSTDLIB_STATIC_API - size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); - - /*! ZSTD_initCStream_usingCDict_advanced() : -- * This function is DEPRECATED, and is approximately equivalent to: -+ * This function is DEPRECATED, and is equivalent to: - * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); -- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. -- * for ((fParam, value) : fParams) { -- * ZSTD_CCtx_setParameter(zcs, fParam, value); -- * } -+ * ZSTD_CCtx_setFParams(zcs, fParams); - * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); - * ZSTD_CCtx_refCDict(zcs, cdict); - * -@@ -2250,6 +2566,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); - * This prototype will generate compilation warnings. - */ - ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") -+ZSTDLIB_STATIC_API - size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, - const ZSTD_CDict* cdict, - ZSTD_frameParameters fParams, -@@ -2264,7 +2581,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, - * explicitly specified. - * - * start a new frame, using same parameters from previous frame. -- * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. -+ * This is typically useful to skip dictionary loading stage, since it will reuse it in-place. - * Note that zcs must be init at least once before using ZSTD_resetCStream(). - * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. - * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. -@@ -2274,6 +2591,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, - * This prototype will generate compilation warnings. - */ - ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") -+ZSTDLIB_STATIC_API - size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); - - -@@ -2319,8 +2637,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); - * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); - * - * note: no dictionary will be used if dict == NULL or dictSize < 8 -- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x - */ -+ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") - ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); - - /*! -@@ -2330,8 +2648,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo - * ZSTD_DCtx_refDDict(zds, ddict); - * - * note : ddict is referenced, it must outlive decompression session -- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x - */ -+ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") - ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); - - /*! 
-@@ -2339,18 +2657,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z - * - * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); - * -- * re-use decompression parameters from previous init; saves dictionary loading -- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x -+ * reuse decompression parameters from previous init; saves dictionary loading - */ -+ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") - ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); - - -+/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* -+ * -+ * *** OVERVIEW *** -+ * The Block-Level Sequence Producer API allows users to provide their own custom -+ * sequence producer which libzstd invokes to process each block. The produced list -+ * of sequences (literals and matches) is then post-processed by libzstd to produce -+ * valid compressed blocks. -+ * -+ * This block-level offload API is a more granular complement of the existing -+ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers -+ * an easier migration story for applications already integrated with libzstd: the -+ * user application continues to invoke the same compression functions -+ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits -+ * from the specific advantages of the external sequence producer. For example, -+ * the sequence producer could be tuned to take advantage of known characteristics -+ * of the input, to offer better speed / ratio, or could leverage hardware -+ * acceleration not available within libzstd itself. -+ * -+ * See contrib/externalSequenceProducer for an example program employing the -+ * Block-Level Sequence Producer API. -+ * -+ * *** USAGE *** -+ * The user is responsible for implementing a function of type -+ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following -+ * arguments to the user-provided function: -+ * -+ * - sequenceProducerState: a pointer to a user-managed state for the sequence -+ * producer. -+ * -+ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. -+ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory -+ * backing outSeqs is managed by the CCtx. -+ * -+ * - src, srcSize: an input buffer for the sequence producer to parse. -+ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. -+ * -+ * - dict, dictSize: a history buffer, which may be empty, which the sequence -+ * producer may reference as it parses the src buffer. Currently, zstd will -+ * always pass dictSize == 0 into external sequence producers, but this will -+ * change in the future. -+ * -+ * - compressionLevel: a signed integer representing the zstd compression level -+ * set by the user for the current operation. The sequence producer may choose -+ * to use this information to change its compression strategy and speed/ratio -+ * tradeoff. Note: the compression level does not reflect zstd parameters set -+ * through the advanced API. -+ * -+ * - windowSize: a size_t representing the maximum allowed offset for external -+ * sequences. Note that sequence offsets are sometimes allowed to exceed the -+ * windowSize if a dictionary is present, see doc/zstd_compression_format.md -+ * for details. -+ * -+ * The user-provided function shall return a size_t representing the number of -+ * sequences written to outSeqs. 
This return value will be treated as an error -+ * code if it is greater than outSeqsCapacity. The return value must be non-zero -+ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided -+ * for convenience, but any value greater than outSeqsCapacity will be treated as -+ * an error code. -+ * -+ * If the user-provided function does not return an error code, the sequences -+ * written to outSeqs must be a valid parse of the src buffer. Data corruption may -+ * occur if the parse is not valid. A parse is defined to be valid if the -+ * following conditions hold: -+ * - The sum of matchLengths and literalLengths must equal srcSize. -+ * - All sequences in the parse, except for the final sequence, must have -+ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have -+ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. -+ * - All offsets must respect the windowSize parameter as specified in -+ * doc/zstd_compression_format.md. -+ * - If the final sequence has matchLength == 0, it must also have offset == 0. -+ * -+ * zstd will only validate these conditions (and fail compression if they do not -+ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence -+ * validation has a performance cost. -+ * -+ * If the user-provided function returns an error, zstd will either fall back -+ * to an internal sequence producer or fail the compression operation. The user can -+ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback -+ * cParam. Fallback compression will follow any other cParam settings, such as -+ * compression level, the same as in a normal compression operation. -+ * -+ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F -+ * function by calling -+ * ZSTD_registerSequenceProducer(cctx, -+ * sequenceProducerState, -+ * sequenceProducer) -+ * This setting will persist until the next parameter reset of the CCtx. -+ * -+ * The sequenceProducerState must be initialized by the user before calling -+ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the -+ * sequenceProducerState. -+ * -+ * *** LIMITATIONS *** -+ * This API is compatible with all zstd compression APIs which respect advanced parameters. -+ * However, there are three limitations: -+ * -+ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. -+ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level -+ * external sequence producer. -+ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some -+ * cases (see its documentation for details). Users must explicitly set -+ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external -+ * sequence producer is registered. -+ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default -+ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should -+ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence -+ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). -+ * -+ * Second, history buffers are not currently supported. Concretely, zstd will always pass -+ * dictSize == 0 to the external sequence producer (for now). This has two implications: -+ * - Dictionaries are not currently supported. Compression will *not* fail if the user -+ * references a dictionary, but the dictionary won't have any effect. 
-+ * - Stream history is not currently supported. All advanced compression APIs, including -+ * streaming APIs, work with external sequence producers, but each block is treated as -+ * an independent chunk without history from previous blocks. -+ * -+ * Third, multi-threading within a single compression is not currently supported. In other words, -+ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. -+ * Multi-threading across compressions is fine: simply create one CCtx per thread. -+ * -+ * Long-term, we plan to overcome all three limitations. There is no technical blocker to -+ * overcoming them. It is purely a question of engineering effort. -+ */ -+ -+#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) -+ -+typedef size_t (*ZSTD_sequenceProducer_F) ( -+ void* sequenceProducerState, -+ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, -+ const void* src, size_t srcSize, -+ const void* dict, size_t dictSize, -+ int compressionLevel, -+ size_t windowSize -+); -+ -+/*! ZSTD_registerSequenceProducer() : -+ * Instruct zstd to use a block-level external sequence producer function. -+ * -+ * The sequenceProducerState must be initialized by the caller, and the caller is -+ * responsible for managing its lifetime. This parameter is sticky across -+ * compressions. It will remain set until the user explicitly resets compression -+ * parameters. -+ * -+ * Sequence producer registration is considered to be an "advanced parameter", -+ * part of the "advanced API". This means it will only have an effect on compression -+ * APIs which respect advanced parameters, such as compress2() and compressStream2(). -+ * Older compression APIs such as compressCCtx(), which predate the introduction of -+ * "advanced parameters", will ignore any external sequence producer setting. -+ * -+ * The sequence producer can be "cleared" by registering a NULL function pointer. This -+ * removes all limitations described above in the "LIMITATIONS" section of the API docs. -+ * -+ * The user is strongly encouraged to read the full API documentation (above) before -+ * calling this function. */ -+ZSTDLIB_STATIC_API void -+ZSTD_registerSequenceProducer( -+ ZSTD_CCtx* cctx, -+ void* sequenceProducerState, -+ ZSTD_sequenceProducer_F sequenceProducer -+); -+ -+/*! ZSTD_CCtxParams_registerSequenceProducer() : -+ * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params. -+ * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(), -+ * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx(). -+ * -+ * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx() -+ * is required, then this function is for you. Otherwise, you probably don't need it. -+ * -+ * See tests/zstreamtest.c for example usage. */ -+ZSTDLIB_STATIC_API void -+ZSTD_CCtxParams_registerSequenceProducer( -+ ZSTD_CCtx_params* params, -+ void* sequenceProducerState, -+ ZSTD_sequenceProducer_F sequenceProducer -+); -+ -+ - /* ******************************************************************* --* Buffer-less and synchronous inner streaming functions -+* Buffer-less and synchronous inner streaming functions (DEPRECATED) -+* -+* This API is deprecated, and will be removed in a future version. -+* It allows streaming (de)compression with user allocated buffers. -+* However, it is hard to use, and not as well tested as the rest of -+* our API. 
- * --* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. --* But it's also a complex one, with several restrictions, documented below. --* Prefer normal streaming API for an easier experience. -+* Please use the normal streaming API instead: ZSTD_compressStream2, -+* and ZSTD_decompressStream. -+* If there is functionality that you need, but it doesn't provide, -+* please open an issue on our GitHub. - ********************************************************************* */ - - /* -@@ -2358,11 +2860,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); - - A ZSTD_CCtx object is required to track streaming operations. - Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. -- ZSTD_CCtx object can be re-used multiple times within successive compression operations. -+ ZSTD_CCtx object can be reused multiple times within successive compression operations. - - Start by initializing a context. - Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. -- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() - - Then, consume your input using ZSTD_compressContinue(). - There are some important considerations to keep in mind when using this advanced function : -@@ -2380,36 +2881,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); - It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. - Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. - -- `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. -+ `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again. - */ - - /*===== Buffer-less streaming compression functions =====*/ -+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") - ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); -+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") - ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); -+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") - ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ --ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ - -+ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") -+ZSTDLIB_STATIC_API -+size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ -+ -+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") - ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); -+ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. 
See docs.") - ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); - - /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ - ZSTD_DEPRECATED("use advanced API to access custom parameters") -+ZSTDLIB_STATIC_API - size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ - ZSTD_DEPRECATED("use advanced API to access custom parameters") -+ZSTDLIB_STATIC_API - size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ - /* - Buffer-less streaming decompression (synchronous mode) - - A ZSTD_DCtx object is required to track streaming operations. - Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. -- A ZSTD_DCtx object can be re-used multiple times. -+ A ZSTD_DCtx object can be reused multiple times. - - First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). - Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. - Data fragment must be large enough to ensure successful decoding. - `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. -- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. -- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. -+ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. -+ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. - errorCode, which can be tested using ZSTD_isError(). - - It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, -@@ -2428,7 +2939,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ - - The most memory efficient way is to use a round buffer of sufficient size. - Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), -- which can @return an error code if required value is too large for current system (in 32-bits mode). -+ which can return an error code if required value is too large for current system (in 32-bits mode). - In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, - up to the moment there is not enough room left in the buffer to guarantee decoding another full block, - which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. -@@ -2448,7 +2959,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ - ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). - ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. - -- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). -+ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). 
- It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. - It can also be an error code, which can be tested with ZSTD_isError(). - -@@ -2471,27 +2982,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ - */ - - /*===== Buffer-less streaming decompression functions =====*/ --typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; --typedef struct { -- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ -- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ -- unsigned blockSizeMax; -- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ -- unsigned headerSize; -- unsigned dictID; -- unsigned checksumFlag; --} ZSTD_frameHeader; - --/*! ZSTD_getFrameHeader() : -- * decode Frame Header, or requires larger `srcSize`. -- * @return : 0, `zfhPtr` is correctly filled, -- * >0, `srcSize` is too small, value is wanted `srcSize` amount, -- * or an error code, which can be tested using ZSTD_isError() */ --ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ --/*! ZSTD_getFrameHeader_advanced() : -- * same as ZSTD_getFrameHeader(), -- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ --ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); - ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ - - ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); -@@ -2502,6 +2993,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); - ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); - - /* misc */ -+ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") - ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); - typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; - ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); -@@ -2509,11 +3001,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); - - - --/* ============================ */ --/* Block level API */ --/* ============================ */ -+/* ========================================= */ -+/* Block level API (DEPRECATED) */ -+/* ========================================= */ - - /*! -+ -+ This API is deprecated in favor of the regular compression API. -+ You can get the frame header down to 2 bytes by setting: -+ - ZSTD_c_format = ZSTD_f_zstd1_magicless -+ - ZSTD_c_contentSizeFlag = 0 -+ - ZSTD_c_checksumFlag = 0 -+ - ZSTD_c_dictIDFlag = 0 -+ -+ This API is not as well tested as our normal API, so we recommend not using it. -+ We will be removing it in a future version. If the normal API doesn't provide -+ the functionality you need, please open a GitHub issue. -+ - Block functions produce and decode raw zstd blocks, without frame metadata. 
- Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). - But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. -@@ -2524,7 +3028,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); - - It is necessary to init context before starting - + compression : any ZSTD_compressBegin*() variant, including with dictionary - + decompression : any ZSTD_decompressBegin*() variant, including with dictionary -- + copyCCtx() and copyDCtx() can be used too - - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB - + If input is larger than a block size, it's necessary to split input data into multiple blocks - + For inputs larger than a single block, consider using regular ZSTD_compress() instead. -@@ -2541,11 +3044,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); - */ - - /*===== Raw zstd block functions =====*/ -+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") - ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); -+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") - ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); -+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") - ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); -+ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") - ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ - -- - #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ - -diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile -index 20f08c644b71..464c410b2768 100644 ---- a/lib/zstd/Makefile -+++ b/lib/zstd/Makefile -@@ -1,6 +1,6 @@ - # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - # ################################################################ --# Copyright (c) Facebook, Inc. -+# Copyright (c) Meta Platforms, Inc. and affiliates. - # All rights reserved. - # - # This source code is licensed under both the BSD-style license (found in the -diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h -new file mode 100644 -index 000000000000..16c3d08e8d1a ---- /dev/null -+++ b/lib/zstd/common/allocations.h -@@ -0,0 +1,56 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ -+/* -+ * Copyright (c) Meta Platforms, Inc. and affiliates. -+ * All rights reserved. -+ * -+ * This source code is licensed under both the BSD-style license (found in the -+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found -+ * in the COPYING file in the root directory of this source tree). -+ * You may select, at your option, one of the above-listed licenses. 
-+ */ -+ -+/* This file provides custom allocation primitives -+ */ -+ -+#define ZSTD_DEPS_NEED_MALLOC -+#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ -+ -+#include "compiler.h" /* MEM_STATIC */ -+#define ZSTD_STATIC_LINKING_ONLY -+#include /* ZSTD_customMem */ -+ -+#ifndef ZSTD_ALLOCATIONS_H -+#define ZSTD_ALLOCATIONS_H -+ -+/* custom memory allocation functions */ -+ -+MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) -+{ -+ if (customMem.customAlloc) -+ return customMem.customAlloc(customMem.opaque, size); -+ return ZSTD_malloc(size); -+} -+ -+MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) -+{ -+ if (customMem.customAlloc) { -+ /* calloc implemented as malloc+memset; -+ * not as efficient as calloc, but next best guess for custom malloc */ -+ void* const ptr = customMem.customAlloc(customMem.opaque, size); -+ ZSTD_memset(ptr, 0, size); -+ return ptr; -+ } -+ return ZSTD_calloc(1, size); -+} -+ -+MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) -+{ -+ if (ptr!=NULL) { -+ if (customMem.customFree) -+ customMem.customFree(customMem.opaque, ptr); -+ else -+ ZSTD_free(ptr); -+ } -+} -+ -+#endif /* ZSTD_ALLOCATIONS_H */ -diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h -new file mode 100644 -index 000000000000..aa3487ec4b6a ---- /dev/null -+++ b/lib/zstd/common/bits.h -@@ -0,0 +1,149 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ -+/* -+ * Copyright (c) Meta Platforms, Inc. and affiliates. -+ * All rights reserved. -+ * -+ * This source code is licensed under both the BSD-style license (found in the -+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found -+ * in the COPYING file in the root directory of this source tree). -+ * You may select, at your option, one of the above-listed licenses. 
-+ */ -+ -+#ifndef ZSTD_BITS_H -+#define ZSTD_BITS_H -+ -+#include "mem.h" -+ -+MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) -+{ -+ assert(val != 0); -+ { -+ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, -+ 30, 22, 20, 15, 25, 17, 4, 8, -+ 31, 27, 13, 23, 21, 19, 16, 7, -+ 26, 12, 18, 6, 11, 5, 10, 9}; -+ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; -+ } -+} -+ -+MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) -+{ -+ assert(val != 0); -+# if (__GNUC__ >= 4) -+ return (unsigned)__builtin_ctz(val); -+# else -+ return ZSTD_countTrailingZeros32_fallback(val); -+# endif -+} -+ -+MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) { -+ assert(val != 0); -+ { -+ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, -+ 11, 14, 16, 18, 22, 25, 3, 30, -+ 8, 12, 20, 28, 15, 17, 24, 7, -+ 19, 27, 23, 6, 26, 5, 4, 31}; -+ val |= val >> 1; -+ val |= val >> 2; -+ val |= val >> 4; -+ val |= val >> 8; -+ val |= val >> 16; -+ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; -+ } -+} -+ -+MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) -+{ -+ assert(val != 0); -+# if (__GNUC__ >= 4) -+ return (unsigned)__builtin_clz(val); -+# else -+ return ZSTD_countLeadingZeros32_fallback(val); -+# endif -+} -+ -+MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) -+{ -+ assert(val != 0); -+# if (__GNUC__ >= 4) && defined(__LP64__) -+ return (unsigned)__builtin_ctzll(val); -+# else -+ { -+ U32 mostSignificantWord = (U32)(val >> 32); -+ U32 leastSignificantWord = (U32)val; -+ if (leastSignificantWord == 0) { -+ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); -+ } else { -+ return ZSTD_countTrailingZeros32(leastSignificantWord); -+ } -+ } -+# endif -+} -+ -+MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) -+{ -+ assert(val != 0); -+# if (__GNUC__ >= 4) -+ return (unsigned)(__builtin_clzll(val)); -+# else -+ { -+ U32 mostSignificantWord = (U32)(val >> 32); -+ U32 leastSignificantWord = (U32)val; -+ if (mostSignificantWord == 0) { -+ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); -+ } else { -+ return ZSTD_countLeadingZeros32(mostSignificantWord); -+ } -+ } -+# endif -+} -+ -+MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) -+{ -+ if (MEM_isLittleEndian()) { -+ if (MEM_64bits()) { -+ return ZSTD_countTrailingZeros64((U64)val) >> 3; -+ } else { -+ return ZSTD_countTrailingZeros32((U32)val) >> 3; -+ } -+ } else { /* Big Endian CPU */ -+ if (MEM_64bits()) { -+ return ZSTD_countLeadingZeros64((U64)val) >> 3; -+ } else { -+ return ZSTD_countLeadingZeros32((U32)val) >> 3; -+ } -+ } -+} -+ -+MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ -+{ -+ assert(val != 0); -+ return 31 - ZSTD_countLeadingZeros32(val); -+} -+ -+/* ZSTD_rotateRight_*(): -+ * Rotates a bitfield to the right by "count" bits. 
-+ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts -+ */ -+MEM_STATIC -+U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { -+ assert(count < 64); -+ count &= 0x3F; /* for fickle pattern recognition */ -+ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); -+} -+ -+MEM_STATIC -+U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { -+ assert(count < 32); -+ count &= 0x1F; /* for fickle pattern recognition */ -+ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); -+} -+ -+MEM_STATIC -+U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { -+ assert(count < 16); -+ count &= 0x0F; /* for fickle pattern recognition */ -+ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); -+} -+ -+#endif /* ZSTD_BITS_H */ -diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h -index feef3a1b1d60..6a13f1f0f1e8 100644 ---- a/lib/zstd/common/bitstream.h -+++ b/lib/zstd/common/bitstream.h -@@ -1,7 +1,8 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* ****************************************************************** - * bitstream - * Part of FSE library -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * You can contact the author at : - * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy -@@ -27,6 +28,7 @@ - #include "compiler.h" /* UNLIKELY() */ - #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ - #include "error_private.h" /* error codes and messages */ -+#include "bits.h" /* ZSTD_highbit32 */ - - - /*========================================= -@@ -79,19 +81,20 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); - /*-******************************************** - * bitStream decoding API (read backward) - **********************************************/ -+typedef size_t BitContainerType; - typedef struct { -- size_t bitContainer; -+ BitContainerType bitContainer; - unsigned bitsConsumed; - const char* ptr; - const char* start; - const char* limitPtr; - } BIT_DStream_t; - --typedef enum { BIT_DStream_unfinished = 0, -- BIT_DStream_endOfBuffer = 1, -- BIT_DStream_completed = 2, -- BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ -- /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ -+typedef enum { BIT_DStream_unfinished = 0, /* fully refilled */ -+ BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */ -+ BIT_DStream_completed = 2, /* bitstream entirely consumed, bit-exact */ -+ BIT_DStream_overflow = 3 /* user requested more bits than present in bitstream */ -+ } BIT_DStream_status; /* result of BIT_reloadDStream() */ - - MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); - MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); -@@ -101,7 +104,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); - - /* Start by invoking BIT_initDStream(). - * A chunk of the bitStream is then stored into a local register. --* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). -+* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType). - * You can then retrieve bitFields stored into the local register, **in reverse order**. - * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. 
- * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. -@@ -122,33 +125,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); - MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); - /* faster, but works only if nbBits >= 1 */ - -- -- --/*-************************************************************** --* Internal functions --****************************************************************/ --MEM_STATIC unsigned BIT_highbit32 (U32 val) --{ -- assert(val != 0); -- { --# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ -- return __builtin_clz (val) ^ 31; --# else /* Software version */ -- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, -- 11, 14, 16, 18, 22, 25, 3, 30, -- 8, 12, 20, 28, 15, 17, 24, 7, -- 19, 27, 23, 6, 26, 5, 4, 31 }; -- U32 v = val; -- v |= v >> 1; -- v |= v >> 2; -- v |= v >> 4; -- v |= v >> 8; -- v |= v >> 16; -- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; --# endif -- } --} -- - /*===== Local Constants =====*/ - static const unsigned BIT_mask[] = { - 0, 1, 3, 7, 0xF, 0x1F, -@@ -178,6 +154,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, - return 0; - } - -+FORCE_INLINE_TEMPLATE size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) -+{ -+ assert(nbBits < BIT_MASK_SIZE); -+ return bitContainer & BIT_mask[nbBits]; -+} -+ - /*! BIT_addBits() : - * can add up to 31 bits into `bitC`. - * Note : does not check for register overflow ! */ -@@ -187,7 +169,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, - DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); - assert(nbBits < BIT_MASK_SIZE); - assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); -- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; -+ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; - bitC->bitPos += nbBits; - } - -@@ -266,35 +248,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si - bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); - bitD->bitContainer = MEM_readLEST(bitD->ptr); - { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; -- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ -+ bitD->bitsConsumed = lastByte ? 
8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ - if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } - } else { - bitD->ptr = bitD->start; - bitD->bitContainer = *(const BYTE*)(bitD->start); - switch(srcSize) - { -- case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); -+ case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); - ZSTD_FALLTHROUGH; - -- case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); -+ case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); - ZSTD_FALLTHROUGH; - -- case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); -+ case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); - ZSTD_FALLTHROUGH; - -- case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; -+ case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24; - ZSTD_FALLTHROUGH; - -- case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; -+ case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16; - ZSTD_FALLTHROUGH; - -- case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; -+ case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) << 8; - ZSTD_FALLTHROUGH; - - default: break; - } - { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; -- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; -+ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; - if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ - } - bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; -@@ -303,12 +285,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si - return srcSize; - } - --MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) -+FORCE_INLINE_TEMPLATE size_t BIT_getUpperBits(BitContainerType bitContainer, U32 const start) - { - return bitContainer >> start; - } - --MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) -+FORCE_INLINE_TEMPLATE size_t BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) - { - U32 const regMask = sizeof(bitContainer)*8 - 1; - /* if start > regMask, bitstream is corrupted, and result is undefined */ -@@ -325,19 +307,13 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c - #endif - } - --MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) --{ -- assert(nbBits < BIT_MASK_SIZE); -- return bitContainer & BIT_mask[nbBits]; --} -- - /*! BIT_lookBits() : - * Provides next n bits from local register. - * local register is not modified. - * On 32-bits, maxNbBits==24. - * On 64-bits, maxNbBits==56. 
- * @return : value extracted */ --MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) -+FORCE_INLINE_TEMPLATE size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) - { - /* arbitrate between double-shift and shift+mask */ - #if 1 -@@ -360,7 +336,7 @@ MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) - return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); - } - --MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) -+FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) - { - bitD->bitsConsumed += nbBits; - } -@@ -369,7 +345,7 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) - * Read (consume) next n bits from local register and update. - * Pay attention to not read more than nbBits contained into local register. - * @return : extracted value. */ --MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) -+FORCE_INLINE_TEMPLATE size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) - { - size_t const value = BIT_lookBits(bitD, nbBits); - BIT_skipBits(bitD, nbBits); -@@ -377,7 +353,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n - } - - /*! BIT_readBitsFast() : -- * unsafe version; only works only if nbBits >= 1 */ -+ * unsafe version; only works if nbBits >= 1 */ - MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) - { - size_t const value = BIT_lookBitsFast(bitD, nbBits); -@@ -386,6 +362,21 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) - return value; - } - -+/*! BIT_reloadDStream_internal() : -+ * Simple variant of BIT_reloadDStream(), with two conditions: -+ * 1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8 -+ * 2. look window is valid after shifted down : bitD->ptr >= bitD->start -+ */ -+MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD) -+{ -+ assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); -+ bitD->ptr -= bitD->bitsConsumed >> 3; -+ assert(bitD->ptr >= bitD->start); -+ bitD->bitsConsumed &= 7; -+ bitD->bitContainer = MEM_readLEST(bitD->ptr); -+ return BIT_DStream_unfinished; -+} -+ - /*! BIT_reloadDStreamFast() : - * Similar to BIT_reloadDStream(), but with two differences: - * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! -@@ -396,31 +387,35 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) - { - if (UNLIKELY(bitD->ptr < bitD->limitPtr)) - return BIT_DStream_overflow; -- assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); -- bitD->ptr -= bitD->bitsConsumed >> 3; -- bitD->bitsConsumed &= 7; -- bitD->bitContainer = MEM_readLEST(bitD->ptr); -- return BIT_DStream_unfinished; -+ return BIT_reloadDStream_internal(bitD); - } - - /*! BIT_reloadDStream() : - * Refill `bitD` from buffer previously set in BIT_initDStream() . -- * This function is safe, it guarantees it will not read beyond src buffer. -+ * This function is safe, it guarantees it will not never beyond src buffer. - * @return : status of `BIT_DStream_t` internal register. 
- * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ --MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) -+FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) - { -- if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ -+ /* note : once in overflow mode, a bitstream remains in this mode until it's reset */ -+ if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) { -+ static const BitContainerType zeroFilled = 0; -+ bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */ -+ /* overflow detected, erroneous scenario or end of stream: no update */ - return BIT_DStream_overflow; -+ } -+ -+ assert(bitD->ptr >= bitD->start); - - if (bitD->ptr >= bitD->limitPtr) { -- return BIT_reloadDStreamFast(bitD); -+ return BIT_reloadDStream_internal(bitD); - } - if (bitD->ptr == bitD->start) { -+ /* reached end of bitStream => no update */ - if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; - return BIT_DStream_completed; - } -- /* start < ptr < limitPtr */ -+ /* start < ptr < limitPtr => cautious update */ - { U32 nbBytes = bitD->bitsConsumed >> 3; - BIT_DStream_status result = BIT_DStream_unfinished; - if (bitD->ptr - nbBytes < bitD->start) { -diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h -index c42d39faf9bd..508ee25537bb 100644 ---- a/lib/zstd/common/compiler.h -+++ b/lib/zstd/common/compiler.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -11,6 +12,8 @@ - #ifndef ZSTD_COMPILER_H - #define ZSTD_COMPILER_H - -+#include -+ - #include "portability_macros.h" - - /*-******************************************************* -@@ -41,12 +44,15 @@ - */ - #define WIN_CDECL - -+/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ -+#define UNUSED_ATTR __attribute__((unused)) -+ - /* - * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant - * parameters. They must be inlined for the compiler to eliminate the constant - * branches. - */ --#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR -+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR - /* - * HINT_INLINE is used to help the compiler generate better code. It is *not* - * used for "templates", so it can be tweaked based on the compilers -@@ -61,11 +67,21 @@ - #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 - # define HINT_INLINE static INLINE_KEYWORD - #else --# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR -+# define HINT_INLINE FORCE_INLINE_TEMPLATE - #endif - --/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ --#define UNUSED_ATTR __attribute__((unused)) -+/* "soft" inline : -+ * The compiler is free to select if it's a good idea to inline or not. -+ * The main objective is to silence compiler warnings -+ * when a defined function in included but not used. -+ * -+ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit. -+ * Updating the prefix is probably preferable, but requires a fairly large codemod, -+ * since this name is used everywhere. 
-+ */ -+#ifndef MEM_STATIC /* already defined in Linux Kernel mem.h */ -+#define MEM_STATIC static __inline UNUSED_ATTR -+#endif - - /* force no inlining */ - #define FORCE_NOINLINE static __attribute__((__noinline__)) -@@ -86,23 +102,24 @@ - # define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) - # define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) - #elif defined(__aarch64__) --# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) --# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) -+# define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) -+# define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) - #else --# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ --# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ -+# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ -+# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ - #endif /* NO_PREFETCH */ - - #define CACHELINE_SIZE 64 - --#define PREFETCH_AREA(p, s) { \ -- const char* const _ptr = (const char*)(p); \ -- size_t const _size = (size_t)(s); \ -- size_t _pos; \ -- for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ -- PREFETCH_L2(_ptr + _pos); \ -- } \ --} -+#define PREFETCH_AREA(p, s) \ -+ do { \ -+ const char* const _ptr = (const char*)(p); \ -+ size_t const _size = (size_t)(s); \ -+ size_t _pos; \ -+ for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ -+ PREFETCH_L2(_ptr + _pos); \ -+ } \ -+ } while (0) - - /* vectorization - * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, -@@ -126,9 +143,9 @@ - #define UNLIKELY(x) (__builtin_expect((x), 0)) - - #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) --# define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } -+# define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0) - #else --# define ZSTD_UNREACHABLE { assert(0); } -+# define ZSTD_UNREACHABLE do { assert(0); } while (0) - #endif - - /* disable warnings */ -@@ -179,6 +196,85 @@ - * Sanitizer - *****************************************************************/ - -+/* -+ * Zstd relies on pointer overflow in its decompressor. -+ * We add this attribute to functions that rely on pointer overflow. -+ */ -+#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+# if __has_attribute(no_sanitize) -+# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8 -+ /* gcc < 8 only has signed-integer-overlow which triggers on pointer overflow */ -+# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow"))) -+# else -+ /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */ -+# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow"))) -+# endif -+# else -+# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+# endif -+#endif -+ -+/* -+ * Helper function to perform a wrapped pointer difference without trigging -+ * UBSAN. -+ * -+ * @returns lhs - rhs with wrapping -+ */ -+MEM_STATIC -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs) -+{ -+ return lhs - rhs; -+} -+ -+/* -+ * Helper function to perform a wrapped pointer add without triggering UBSAN. 
-+ * -+ * @return ptr + add with wrapping -+ */ -+MEM_STATIC -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add) -+{ -+ return ptr + add; -+} -+ -+/* -+ * Helper function to perform a wrapped pointer subtraction without triggering -+ * UBSAN. -+ * -+ * @return ptr - sub with wrapping -+ */ -+MEM_STATIC -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub) -+{ -+ return ptr - sub; -+} -+ -+/* -+ * Helper function to add to a pointer that works around C's undefined behavior -+ * of adding 0 to NULL. -+ * -+ * @returns `ptr + add` except it defines `NULL + 0 == NULL`. -+ */ -+MEM_STATIC -+unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add) -+{ -+ return add > 0 ? ptr + add : ptr; -+} -+ -+/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an -+ * abundance of caution, disable our custom poisoning on mingw. */ -+#ifdef __MINGW32__ -+#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE -+#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 -+#endif -+#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE -+#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 -+#endif -+#endif -+ - - - #endif /* ZSTD_COMPILER_H */ -diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h -index 0db7b42407ee..d8319a2bef4c 100644 ---- a/lib/zstd/common/cpu.h -+++ b/lib/zstd/common/cpu.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c -index bb863c9ea616..8eb6aa9a3b20 100644 ---- a/lib/zstd/common/debug.c -+++ b/lib/zstd/common/debug.c -@@ -1,7 +1,8 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* ****************************************************************** - * debug - * Part of FSE library -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * You can contact the author at : - * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy -@@ -21,4 +22,10 @@ - - #include "debug.h" - -+#if (DEBUGLEVEL>=2) -+/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a -+ * translation unit is empty. So remove this from Linux kernel builds, but -+ * otherwise just leave it in. -+ */ - int g_debuglevel = DEBUGLEVEL; -+#endif -diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h -index 6dd88d1fbd02..226ba3c57ec3 100644 ---- a/lib/zstd/common/debug.h -+++ b/lib/zstd/common/debug.h -@@ -1,7 +1,8 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* ****************************************************************** - * debug - * Part of FSE library -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * You can contact the author at : - * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy -@@ -82,18 +83,27 @@ extern int g_debuglevel; /* the variable is only declared, - It's useful when enabling very verbose levels - on selective conditions (such as position in src) */ - --# define RAWLOG(l, ...) { \ -- if (l<=g_debuglevel) { \ -- ZSTD_DEBUG_PRINT(__VA_ARGS__); \ -- } } --# define DEBUGLOG(l, ...) 
{ \ -- if (l<=g_debuglevel) { \ -- ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \ -- ZSTD_DEBUG_PRINT(" \n"); \ -- } } -+# define RAWLOG(l, ...) \ -+ do { \ -+ if (l<=g_debuglevel) { \ -+ ZSTD_DEBUG_PRINT(__VA_ARGS__); \ -+ } \ -+ } while (0) -+ -+#define STRINGIFY(x) #x -+#define TOSTRING(x) STRINGIFY(x) -+#define LINE_AS_STRING TOSTRING(__LINE__) -+ -+# define DEBUGLOG(l, ...) \ -+ do { \ -+ if (l<=g_debuglevel) { \ -+ ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \ -+ ZSTD_DEBUG_PRINT(" \n"); \ -+ } \ -+ } while (0) - #else --# define RAWLOG(l, ...) {} /* disabled */ --# define DEBUGLOG(l, ...) {} /* disabled */ -+# define RAWLOG(l, ...) do { } while (0) /* disabled */ -+# define DEBUGLOG(l, ...) do { } while (0) /* disabled */ - #endif - - -diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c -index fef67056f052..6cdd82233fb5 100644 ---- a/lib/zstd/common/entropy_common.c -+++ b/lib/zstd/common/entropy_common.c -@@ -1,6 +1,7 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* ****************************************************************** - * Common functions of New Generation Entropy library -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * You can contact the author at : - * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy -@@ -19,8 +20,8 @@ - #include "error_private.h" /* ERR_*, ERROR */ - #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ - #include "fse.h" --#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ - #include "huf.h" -+#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ - - - /*=== Version ===*/ -@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } - /*-************************************************************** - * FSE NCount encoding-decoding - ****************************************************************/ --static U32 FSE_ctz(U32 val) --{ -- assert(val != 0); -- { --# if (__GNUC__ >= 3) /* GCC Intrinsic */ -- return __builtin_ctz(val); --# else /* Software version */ -- U32 count = 0; -- while ((val & 1) == 0) { -- val >>= 1; -- ++count; -- } -- return count; --# endif -- } --} -- - FORCE_INLINE_TEMPLATE - size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, - const void* headerBuffer, size_t hbSize) -@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne - * repeat. - * Avoid UB by setting the high bit to 1. - */ -- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; -+ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; - while (repeats >= 12) { - charnum += 3 * 12; - if (LIKELY(ip <= iend-7)) { -@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne - ip = iend - 4; - } - bitStream = MEM_readLE32(ip) >> bitCount; -- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; -+ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; - } - charnum += 3 * repeats; - bitStream >>= 2 * repeats; -@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne - * know that threshold > 1. 
- */ - if (remaining <= 1) break; -- nbBits = BIT_highbit32(remaining) + 1; -+ nbBits = ZSTD_highbit32(remaining) + 1; - threshold = 1 << (nbBits - 1); - } - if (charnum >= maxSV1) break; -@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, - const void* src, size_t srcSize) - { - U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; -- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); -+ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); - } - - FORCE_INLINE_TEMPLATE size_t -@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, - if (weightTotal == 0) return ERROR(corruption_detected); - - /* get last non-null symbol weight (implied, total must be 2^n) */ -- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; -+ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; - if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); - *tableLogPtr = tableLog; - /* determine last weight */ - { U32 const total = 1 << tableLog; - U32 const rest = total - weightTotal; -- U32 const verif = 1 << BIT_highbit32(rest); -- U32 const lastWeight = BIT_highbit32(rest) + 1; -+ U32 const verif = 1 << ZSTD_highbit32(rest); -+ U32 const lastWeight = ZSTD_highbit32(rest) + 1; - if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ - huffWeight[oSize] = (BYTE)lastWeight; - rankStats[lastWeight]++; -@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, - U32* nbSymbolsPtr, U32* tableLogPtr, - const void* src, size_t srcSize, - void* workSpace, size_t wkspSize, -- int bmi2) -+ int flags) - { - #if DYNAMIC_BMI2 -- if (bmi2) { -+ if (flags & HUF_flags_bmi2) { - return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); - } - #endif -- (void)bmi2; -+ (void)flags; - return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); - } -diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c -index 6d1135f8c373..a4062d30d170 100644 ---- a/lib/zstd/common/error_private.c -+++ b/lib/zstd/common/error_private.c -@@ -1,5 +1,6 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) - case PREFIX(version_unsupported): return "Version not supported"; - case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; - case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; -- case PREFIX(corruption_detected): return "Corrupted block detected"; -+ case PREFIX(corruption_detected): return "Data corruption detected"; - case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; -+ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; - case PREFIX(parameter_unsupported): return "Unsupported parameter"; -+ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; - case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; - case PREFIX(init_missing): return "Context should be init first"; - case PREFIX(memory_allocation): return "Allocation error : not enough memory"; -@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code) - case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; - case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; - case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; -+ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; - case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; - case PREFIX(dictionary_wrong): return "Dictionary mismatch"; - case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; - case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; - case PREFIX(srcSize_wrong): return "Src size is incorrect"; - case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; -+ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; -+ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; - /* following error codes are not stable and may be removed or changed in a future version */ - case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; - case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; - case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; - case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; -+ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; -+ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; - case PREFIX(maxCode): - default: return notErrorCode; - } -diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h -index ca5101e542fa..0410ca415b54 100644 ---- a/lib/zstd/common/error_private.h -+++ b/lib/zstd/common/error_private.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -49,8 +50,13 @@ ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } - ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } - - /* check and forward error code */ --#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e --#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } -+#define CHECK_V_F(e, f) \ -+ size_t const e = f; \ -+ do { \ -+ if (ERR_isError(e)) \ -+ return e; \ -+ } while (0) -+#define CHECK_F(f) do { CHECK_V_F(_var_err__, f); } while (0) - - - /*-**************************************** -@@ -84,10 +90,12 @@ void _force_has_format_string(const char *format, ...) { - * We want to force this function invocation to be syntactically correct, but - * we don't want to force runtime evaluation of its arguments. - */ --#define _FORCE_HAS_FORMAT_STRING(...) \ -- if (0) { \ -- _force_has_format_string(__VA_ARGS__); \ -- } -+#define _FORCE_HAS_FORMAT_STRING(...) \ -+ do { \ -+ if (0) { \ -+ _force_has_format_string(__VA_ARGS__); \ -+ } \ -+ } while (0) - - #define ERR_QUOTE(str) #str - -@@ -98,48 +106,50 @@ void _force_has_format_string(const char *format, ...) { - * In order to do that (particularly, printing the conditional that failed), - * this can't just wrap RETURN_ERROR(). - */ --#define RETURN_ERROR_IF(cond, err, ...) \ -- if (cond) { \ -- RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ -- __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ -- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ -- RAWLOG(3, ": " __VA_ARGS__); \ -- RAWLOG(3, "\n"); \ -- return ERROR(err); \ -- } -+#define RETURN_ERROR_IF(cond, err, ...) \ -+ do { \ -+ if (cond) { \ -+ RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ -+ __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ -+ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ -+ RAWLOG(3, ": " __VA_ARGS__); \ -+ RAWLOG(3, "\n"); \ -+ return ERROR(err); \ -+ } \ -+ } while (0) - - /* - * Unconditionally return the specified error. - * - * In debug modes, prints additional information. - */ --#define RETURN_ERROR(err, ...) \ -- do { \ -- RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ -- __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ -- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ -- RAWLOG(3, ": " __VA_ARGS__); \ -- RAWLOG(3, "\n"); \ -- return ERROR(err); \ -- } while(0); -+#define RETURN_ERROR(err, ...) \ -+ do { \ -+ RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ -+ __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ -+ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ -+ RAWLOG(3, ": " __VA_ARGS__); \ -+ RAWLOG(3, "\n"); \ -+ return ERROR(err); \ -+ } while(0) - - /* - * If the provided expression evaluates to an error code, returns that error code. - * - * In debug modes, prints additional information. - */ --#define FORWARD_IF_ERROR(err, ...) \ -- do { \ -- size_t const err_code = (err); \ -- if (ERR_isError(err_code)) { \ -- RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ -- __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ -- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ -- RAWLOG(3, ": " __VA_ARGS__); \ -- RAWLOG(3, "\n"); \ -- return err_code; \ -- } \ -- } while(0); -+#define FORWARD_IF_ERROR(err, ...) 
\ -+ do { \ -+ size_t const err_code = (err); \ -+ if (ERR_isError(err_code)) { \ -+ RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ -+ __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ -+ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ -+ RAWLOG(3, ": " __VA_ARGS__); \ -+ RAWLOG(3, "\n"); \ -+ return err_code; \ -+ } \ -+ } while(0) - - - #endif /* ERROR_H_MODULE */ -diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h -index 4507043b2287..2185a578617d 100644 ---- a/lib/zstd/common/fse.h -+++ b/lib/zstd/common/fse.h -@@ -1,7 +1,8 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* ****************************************************************** - * FSE : Finite State Entropy codec - * Public Prototypes declaration -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * You can contact the author at : - * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy -@@ -50,34 +51,6 @@ - FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ - - --/*-**************************************** --* FSE simple functions --******************************************/ --/*! FSE_compress() : -- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. -- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). -- @return : size of compressed data (<= dstCapacity). -- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! -- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. -- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) --*/ --FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, -- const void* src, size_t srcSize); -- --/*! FSE_decompress(): -- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', -- into already allocated destination buffer 'dst', of size 'dstCapacity'. -- @return : size of regenerated data (<= maxDstSize), -- or an error code, which can be tested using FSE_isError() . -- -- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! -- Why ? : making this distinction requires a header. -- Header management is intentionally delegated to the user layer, which can better manage special cases. --*/ --FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, -- const void* cSrc, size_t cSrcSize); -- -- - /*-***************************************** - * Tool functions - ******************************************/ -@@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return - FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ - - --/*-***************************************** --* FSE advanced functions --******************************************/ --/*! FSE_compress2() : -- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' -- Both parameters can be defined as '0' to mean : use default value -- @return : size of compressed data -- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! -- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. -- if FSE_isError(return), it's an error code. 
--*/ --FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); -- -- - /*-***************************************** - * FSE detailed API - ******************************************/ -@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, - /*! Constructor and Destructor of FSE_CTable. - Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ - typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ --FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); --FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); - - /*! FSE_buildCTable(): - Builds `ct`, which must be already allocated, using FSE_createCTable(). -@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, - unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, - const void* rBuffer, size_t rBuffSize, int bmi2); - --/*! Constructor and Destructor of FSE_DTable. -- Note that its size depends on 'tableLog' */ - typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ --FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); --FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); -- --/*! FSE_buildDTable(): -- Builds 'dt', which must be already allocated, using FSE_createDTable(). -- return : 0, or an errorCode, which can be tested using FSE_isError() */ --FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); -- --/*! FSE_decompress_usingDTable(): -- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` -- into `dst` which must be already allocated. -- @return : size of regenerated data (necessarily <= `dstCapacity`), -- or an errorCode, which can be tested using FSE_isError() */ --FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); - - /*! - Tutorial : -@@ -286,6 +227,7 @@ If there is an error, the function will return an error code, which can be teste - - #endif /* FSE_H */ - -+ - #if !defined(FSE_H_FSE_STATIC_LINKING_ONLY) - #define FSE_H_FSE_STATIC_LINKING_ONLY - -@@ -317,16 +259,6 @@ If there is an error, the function will return an error code, which can be teste - unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); - /*< same as FSE_optimalTableLog(), which used `minus==2` */ - --/* FSE_compress_wksp() : -- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). -- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. -- */ --#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? 
(1 << (maxTableLog - 2)) : 1024) ) --size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); -- --size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); --/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ -- - size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); - /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ - -@@ -344,19 +276,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi - FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); - /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ - --size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); --/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ -- --size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); --/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ -- --#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) -+#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) - #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) --size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); --/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ -- - size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); --/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */ -+/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. 
-+ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ - - typedef enum { - FSE_repeat_none, /*< Cannot use the previous table */ -@@ -539,20 +463,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, un - FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; - const U16* const stateTable = (const U16*)(statePtr->stateTable); - U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); -- BIT_addBits(bitC, statePtr->value, nbBitsOut); -+ BIT_addBits(bitC, (size_t)statePtr->value, nbBitsOut); - statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; - } - - MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) - { -- BIT_addBits(bitC, statePtr->value, statePtr->stateLog); -+ BIT_addBits(bitC, (size_t)statePtr->value, statePtr->stateLog); - BIT_flushBits(bitC); - } - - - /* FSE_getMaxNbBits() : - * Approximate maximum cost of a symbol, in bits. -- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) -+ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) - * note 1 : assume symbolValue is valid (<= maxSymbolValue) - * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ - MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) -diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c -index 8dcb8ca39767..3a17e84f27bf 100644 ---- a/lib/zstd/common/fse_decompress.c -+++ b/lib/zstd/common/fse_decompress.c -@@ -1,6 +1,7 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* ****************************************************************** - * FSE : Finite State Entropy decoder -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. 
- * - * You can contact the author at : - * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy -@@ -22,8 +23,8 @@ - #define FSE_STATIC_LINKING_ONLY - #include "fse.h" - #include "error_private.h" --#define ZSTD_DEPS_NEED_MALLOC --#include "zstd_deps.h" -+#include "zstd_deps.h" /* ZSTD_memcpy */ -+#include "bits.h" /* ZSTD_highbit32 */ - - - /* ************************************************************** -@@ -55,19 +56,6 @@ - #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) - #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) - -- --/* Function templates */ --FSE_DTable* FSE_createDTable (unsigned tableLog) --{ -- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; -- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); --} -- --void FSE_freeDTable (FSE_DTable* dt) --{ -- ZSTD_free(dt); --} -- - static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) - { - void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ -@@ -96,7 +84,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo - symbolNext[s] = 1; - } else { - if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; -- symbolNext[s] = normalizedCounter[s]; -+ symbolNext[s] = (U16)normalizedCounter[s]; - } } } - ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); - } -@@ -111,8 +99,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo - * all symbols have counts <= 8. We ensure we have 8 bytes at the end of - * our buffer to handle the over-write. - */ -- { -- U64 const add = 0x0101010101010101ull; -+ { U64 const add = 0x0101010101010101ull; - size_t pos = 0; - U64 sv = 0; - U32 s; -@@ -123,14 +110,13 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo - for (i = 8; i < n; i += 8) { - MEM_write64(spread + pos + i, sv); - } -- pos += n; -- } -- } -+ pos += (size_t)n; -+ } } - /* Now we spread those positions across the table. -- * The benefit of doing it in two stages is that we avoid the the -+ * The benefit of doing it in two stages is that we avoid the - * variable size inner loop, which caused lots of branch misses. - * Now we can run through all the positions without any branch misses. -- * We unroll the loop twice, since that is what emperically worked best. -+ * We unroll the loop twice, since that is what empirically worked best. 
- */ - { - size_t position = 0; -@@ -166,7 +152,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo - for (u=0; utableLog = 0; -- DTableH->fastMode = 0; -- -- cell->newState = 0; -- cell->symbol = symbolValue; -- cell->nbBits = 0; -- -- return 0; --} -- -- --size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) --{ -- void* ptr = dt; -- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; -- void* dPtr = dt + 1; -- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; -- const unsigned tableSize = 1 << nbBits; -- const unsigned tableMask = tableSize - 1; -- const unsigned maxSV1 = tableMask+1; -- unsigned s; -- -- /* Sanity checks */ -- if (nbBits < 1) return ERROR(GENERIC); /* min size */ -- -- /* Build Decoding Table */ -- DTableH->tableLog = (U16)nbBits; -- DTableH->fastMode = 1; -- for (s=0; sfastMode; -- -- /* select fast mode (static) */ -- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); -- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); --} -- -- --size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) --{ -- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); -+ assert(op >= ostart); -+ return (size_t)(op-ostart); - } - - typedef struct { - short ncount[FSE_MAX_SYMBOL_VALUE + 1]; -- FSE_DTable dtable[]; /* Dynamically sized */ - } FSE_DecompressWksp; - - -@@ -327,13 +250,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( - unsigned tableLog; - unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; - FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; -+ size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable); -+ FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos; - -- DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); -+ FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); - if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC); - -+ /* correct offset to dtable depends on this property */ -+ FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0); -+ - /* normal FSE decoding mode */ -- { -- size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); -+ { size_t const NCountLength = -+ FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); - if (FSE_isError(NCountLength)) return NCountLength; - if (tableLog > maxLog) return ERROR(tableLog_tooLarge); - assert(NCountLength <= cSrcSize); -@@ -342,19 +270,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( - } - - if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); -- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); -+ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); -+ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); - wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); - -- CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); -+ CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); - - { -- const void* ptr = wksp->dtable; -+ const void* ptr = dtable; - const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; - const U32 fastMode = DTableH->fastMode; - - /* select fast mode (static) */ -- if 
(fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1); -- return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0); -+ if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); -+ return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); - } - } - -@@ -382,9 +311,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, - return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); - } - -- --typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; -- -- -- - #endif /* FSE_COMMONDEFS_ONLY */ -diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h -index 5042ff870308..57462466e188 100644 ---- a/lib/zstd/common/huf.h -+++ b/lib/zstd/common/huf.h -@@ -1,7 +1,8 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* ****************************************************************** - * huff0 huffman codec, - * part of Finite State Entropy library -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * You can contact the author at : - * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy -@@ -18,99 +19,22 @@ - - /* *** Dependencies *** */ - #include "zstd_deps.h" /* size_t */ -- -- --/* *** library symbols visibility *** */ --/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, -- * HUF symbols remain "private" (internal symbols for library only). -- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ --#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) --# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) --#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ --# define HUF_PUBLIC_API __declspec(dllexport) --#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) --# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ --#else --# define HUF_PUBLIC_API --#endif -- -- --/* ========================== */ --/* *** simple functions *** */ --/* ========================== */ -- --/* HUF_compress() : -- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. -- * 'dst' buffer must be already allocated. -- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). -- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. -- * @return : size of compressed data (<= `dstCapacity`). -- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! -- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) -- */ --HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, -- const void* src, size_t srcSize); -- --/* HUF_decompress() : -- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', -- * into already allocated buffer 'dst', of minimum size 'dstSize'. -- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. -- * Note : in contrast with FSE, HUF_decompress can regenerate -- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, -- * because it knows size to regenerate (originalSize). 
-- * @return : size of regenerated data (== originalSize), -- * or an error code, which can be tested using HUF_isError() -- */ --HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, -- const void* cSrc, size_t cSrcSize); -+#include "mem.h" /* U32 */ -+#define FSE_STATIC_LINKING_ONLY -+#include "fse.h" - - - /* *** Tool functions *** */ --#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ --HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ -+#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ -+size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ - - /* Error Management */ --HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ --HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ -- -+unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ -+const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ - --/* *** Advanced function *** */ - --/* HUF_compress2() : -- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. -- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . -- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ --HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, -- const void* src, size_t srcSize, -- unsigned maxSymbolValue, unsigned tableLog); -- --/* HUF_compress4X_wksp() : -- * Same as HUF_compress2(), but uses externally allocated `workSpace`. -- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ - #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) - #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) --HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, -- const void* src, size_t srcSize, -- unsigned maxSymbolValue, unsigned tableLog, -- void* workSpace, size_t wkspSize); -- --#endif /* HUF_H_298734234 */ -- --/* ****************************************************************** -- * WARNING !! -- * The following section contains advanced and experimental definitions -- * which shall never be used in the context of a dynamic library, -- * because they are not guaranteed to remain stable in the future. -- * Only consider them in association with static linking. 
-- * *****************************************************************/ --#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) --#define HUF_H_HUF_STATIC_LINKING_ONLY -- --/* *** Dependencies *** */ --#include "mem.h" /* U32 */ --#define FSE_STATIC_LINKING_ONLY --#include "fse.h" -- - - /* *** Constants *** */ - #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ -@@ -151,25 +75,49 @@ typedef U32 HUF_DTable; - /* **************************************** - * Advanced decompression functions - ******************************************/ --size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ --#ifndef HUF_FORCE_DECOMPRESS_X1 --size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ --#endif - --size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ --size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ --size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ --size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ --size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ --#ifndef HUF_FORCE_DECOMPRESS_X1 --size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ --size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ --#endif -+/* -+ * Huffman flags bitset. -+ * For all flags, 0 is the default value. -+ */ -+typedef enum { -+ /* -+ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. -+ * Otherwise: Ignored. -+ */ -+ HUF_flags_bmi2 = (1 << 0), -+ /* -+ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. -+ * If unset: Use heuristic to find the table depth. -+ */ -+ HUF_flags_optimalDepth = (1 << 1), -+ /* -+ * If set: If the previous table can encode the input, always reuse the previous table. -+ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. -+ */ -+ HUF_flags_preferRepeat = (1 << 2), -+ /* -+ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. -+ * If unset: Always histogram the entire input. -+ */ -+ HUF_flags_suspectUncompressible = (1 << 3), -+ /* -+ * If set: Don't use assembly implementations -+ * If unset: Allow using assembly implementations -+ */ -+ HUF_flags_disableAsm = (1 << 4), -+ /* -+ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. -+ * If unset: Use the fast decoding loop when possible. 
-+ */ -+ HUF_flags_disableFast = (1 << 5) -+} HUF_flags_e; - - - /* **************************************** - * HUF detailed API - * ****************************************/ -+#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra - - /*! HUF_compress() does the following: - * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") -@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, - * For example, it's possible to compress several blocks using the same 'CTable', - * or to save and regenerate 'CTable' using external methods. - */ --unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); --size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ --size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); -+unsigned HUF_minTableLog(unsigned symbolCardinality); -+unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); -+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, -+ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ - size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); --size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); --size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); -+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); - size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); - int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); - -@@ -196,6 +144,7 @@ typedef enum { - HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ - HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ - } HUF_repeat; -+ - /* HUF_compress4X_repeat() : - * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. - * If it uses hufTable it does not modify hufTable or repeat. -@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned tableLog, - void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ -- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); -+ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); - - /* HUF_buildCTable_wksp() : - * Same as HUF_buildCTable(), but using externally allocated scratch buffer. - * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. 
- */ --#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) -+#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) - #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) - size_t HUF_buildCTable_wksp (HUF_CElt* tree, - const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, -@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, - U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, - const void* src, size_t srcSize, - void* workspace, size_t wkspSize, -- int bmi2); -+ int flags); - - /* HUF_readCTable() : - * Loading a CTable saved with HUF_writeCTable() */ -@@ -246,9 +195,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void - - /* HUF_getNbBitsFromCTable() : - * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX -- * Note 1 : is not inlined, as HUF_CElt definition is private */ -+ * Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0 -+ * Note 2 : is not inlined, as HUF_CElt definition is private -+ */ - U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); - -+typedef struct { -+ BYTE tableLog; -+ BYTE maxSymbolValue; -+ BYTE unused[sizeof(size_t) - 2]; -+} HUF_CTableHeader; -+ -+/* HUF_readCTableHeader() : -+ * @returns The header from the CTable specifying the tableLog and the maxSymbolValue. -+ */ -+HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable); -+ - /* - * HUF_decompress() does the following: - * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics -@@ -276,32 +238,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); - #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) - #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) - --#ifndef HUF_FORCE_DECOMPRESS_X2 --size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); --size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); --#endif --#ifndef HUF_FORCE_DECOMPRESS_X1 --size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); --size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); --#endif -- --size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); --#ifndef HUF_FORCE_DECOMPRESS_X2 --size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); --#endif --#ifndef HUF_FORCE_DECOMPRESS_X1 --size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); --#endif -- - - /* ====================== */ - /* single stream variants */ - /* ====================== */ - --size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); --size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ --size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); --size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, 
const HUF_CElt* CTable, int bmi2); -+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); - /* HUF_compress1X_repeat() : - * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. - * If it uses hufTable it does not modify hufTable or repeat. -@@ -312,47 +254,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned tableLog, - void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ -- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); -+ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); - --size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ --#ifndef HUF_FORCE_DECOMPRESS_X1 --size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ --#endif -- --size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); --size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); --#ifndef HUF_FORCE_DECOMPRESS_X2 --size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ --size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ --#endif -+size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); - #ifndef HUF_FORCE_DECOMPRESS_X1 --size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ --size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ --#endif -- --size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ --#ifndef HUF_FORCE_DECOMPRESS_X2 --size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); --#endif --#ifndef HUF_FORCE_DECOMPRESS_X1 --size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); -+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ - #endif - - /* BMI2 variants. - * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
- */ --size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); -+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); - #ifndef HUF_FORCE_DECOMPRESS_X2 --size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); -+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); - #endif --size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); --size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); -+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); -+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); - #ifndef HUF_FORCE_DECOMPRESS_X2 --size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); -+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); - #endif - #ifndef HUF_FORCE_DECOMPRESS_X1 --size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); -+size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); - #endif - --#endif /* HUF_STATIC_LINKING_ONLY */ -+#endif /* HUF_H_298734234 */ - -diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h -index c22a2e69bf46..d9bd752fe17b 100644 ---- a/lib/zstd/common/mem.h -+++ b/lib/zstd/common/mem.h -@@ -1,6 +1,6 @@ - /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -24,6 +24,7 @@ - /*-**************************************** - * Compiler specifics - ******************************************/ -+#undef MEM_STATIC /* may be already defined from common/compiler.h */ - #define MEM_STATIC static inline - - /*-************************************************************** -diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h -index 0e3b2c0a527d..f08638cced6c 100644 ---- a/lib/zstd/common/portability_macros.h -+++ b/lib/zstd/common/portability_macros.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -12,7 +13,7 @@ - #define ZSTD_PORTABILITY_MACROS_H - - /* -- * This header file contains macro defintions to support portability. -+ * This header file contains macro definitions to support portability. - * This header is shared between C and ASM code, so it MUST only - * contain macro definitions. 
It MUST not contain any C code. - * -@@ -45,6 +46,8 @@ - /* Mark the internal assembly functions as hidden */ - #ifdef __ELF__ - # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func -+#elif defined(__APPLE__) -+# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func - #else - # define ZSTD_HIDE_ASM_FUNCTION(func) - #endif -@@ -65,7 +68,7 @@ - #endif - - /* -- * Only enable assembly for GNUC comptabile compilers, -+ * Only enable assembly for GNUC compatible compilers, - * because other platforms may not support GAS assembly syntax. - * - * Only enable assembly for Linux / MacOS, other platforms may -@@ -90,4 +93,23 @@ - */ - #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 - -+/* -+ * For x86 ELF targets, add .note.gnu.property section for Intel CET in -+ * assembly sources when CET is enabled. -+ * -+ * Additionally, any function that may be called indirectly must begin -+ * with ZSTD_CET_ENDBRANCH. -+ */ -+#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ -+ && defined(__has_include) -+# if __has_include() -+# include -+# define ZSTD_CET_ENDBRANCH _CET_ENDBR -+# endif -+#endif -+ -+#ifndef ZSTD_CET_ENDBRANCH -+# define ZSTD_CET_ENDBRANCH -+#endif -+ - #endif /* ZSTD_PORTABILITY_MACROS_H */ -diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c -index 3d7e35b309b5..44b95b25344a 100644 ---- a/lib/zstd/common/zstd_common.c -+++ b/lib/zstd/common/zstd_common.c -@@ -1,5 +1,6 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -14,7 +15,6 @@ - * Dependencies - ***************************************/ - #define ZSTD_DEPS_NEED_MALLOC --#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ - #include "error_private.h" - #include "zstd_internal.h" - -@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } - /*! ZSTD_getErrorString() : - * provides error code string from enum */ - const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } -- -- -- --/*=************************************************************** --* Custom allocator --****************************************************************/ --void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) --{ -- if (customMem.customAlloc) -- return customMem.customAlloc(customMem.opaque, size); -- return ZSTD_malloc(size); --} -- --void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) --{ -- if (customMem.customAlloc) { -- /* calloc implemented as malloc+memset; -- * not as efficient as calloc, but next best guess for custom malloc */ -- void* const ptr = customMem.customAlloc(customMem.opaque, size); -- ZSTD_memset(ptr, 0, size); -- return ptr; -- } -- return ZSTD_calloc(1, size); --} -- --void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) --{ -- if (ptr!=NULL) { -- if (customMem.customFree) -- customMem.customFree(customMem.opaque, ptr); -- else -- ZSTD_free(ptr); -- } --} -diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h -index 2c34e8a33a1c..f931f7d0e294 100644 ---- a/lib/zstd/common/zstd_deps.h -+++ b/lib/zstd/common/zstd_deps.h -@@ -1,6 +1,6 @@ - /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) { - - #endif /* ZSTD_DEPS_IO */ - #endif /* ZSTD_DEPS_NEED_IO */ -+ -+/* -+ * Only requested when MSAN is enabled. -+ * Need: -+ * intptr_t -+ */ -+#ifdef ZSTD_DEPS_NEED_STDINT -+#ifndef ZSTD_DEPS_STDINT -+#define ZSTD_DEPS_STDINT -+ -+/* intptr_t already provided by ZSTD_DEPS_COMMON */ -+ -+#endif /* ZSTD_DEPS_STDINT */ -+#endif /* ZSTD_DEPS_NEED_STDINT */ -diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h -index 93305d9b41bb..11da1233e890 100644 ---- a/lib/zstd/common/zstd_internal.h -+++ b/lib/zstd/common/zstd_internal.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -28,7 +29,6 @@ - #include - #define FSE_STATIC_LINKING_ONLY - #include "fse.h" --#define HUF_STATIC_LINKING_ONLY - #include "huf.h" - #include /* XXH_reset, update, digest */ - #define ZSTD_TRACE 0 -@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; - #define ZSTD_FRAMECHECKSUMSIZE 4 - - #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ --#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ -+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */ -+#define MIN_LITERALS_FOR_4_STREAMS 6 - --#define HufLog 12 - typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; - - #define LONGNBSEQ 0x7F00 -@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy - #define MINMATCH 3 - - #define Litbits 8 -+#define LitHufLog 11 - #define MaxLit ((1<= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); -@@ -225,12 +228,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e - * one COPY16() in the first call. Then, do two calls per loop since - * at that point it is more likely to have a high trip count. - */ --#ifdef __aarch64__ -- do { -- COPY16(op, ip); -- } -- while (op < oend); --#else - ZSTD_copy16(op, ip); - if (16 >= length) return; - op += 16; -@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e - COPY16(op, ip); - } - while (op < oend); --#endif - } - } - -@@ -289,11 +285,11 @@ typedef enum { - typedef struct { - seqDef* sequencesStart; - seqDef* sequences; /* ptr to end of sequences */ -- BYTE* litStart; -- BYTE* lit; /* ptr to end of literals */ -- BYTE* llCode; -- BYTE* mlCode; -- BYTE* ofCode; -+ BYTE* litStart; -+ BYTE* lit; /* ptr to end of literals */ -+ BYTE* llCode; -+ BYTE* mlCode; -+ BYTE* ofCode; - size_t maxNbSeq; - size_t maxNbLit; - -@@ -301,8 +297,8 @@ typedef struct { - * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment - * the existing value of the litLength or matchLength by 0x10000. 
- */ -- ZSTD_longLengthType_e longLengthType; -- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ -+ ZSTD_longLengthType_e longLengthType; -+ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ - } seqStore_t; - - typedef struct { -@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore - seqLen.matchLength = seq->mlBase + MINMATCH; - if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { - if (seqStore->longLengthType == ZSTD_llt_literalLength) { -- seqLen.litLength += 0xFFFF; -+ seqLen.litLength += 0x10000; - } - if (seqStore->longLengthType == ZSTD_llt_matchLength) { -- seqLen.matchLength += 0xFFFF; -+ seqLen.matchLength += 0x10000; - } - } - return seqLen; -@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore - * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` - */ - typedef struct { -+ size_t nbBlocks; - size_t compressedSize; - unsigned long long decompressedBound; - } ZSTD_frameSizeInfo; /* decompress & legacy */ - - const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ --void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ -- --/* custom memory allocation functions */ --void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); --void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); --void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); -- -- --MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ --{ -- assert(val != 0); -- { --# if (__GNUC__ >= 3) /* GCC Intrinsic */ -- return __builtin_clz (val) ^ 31; --# else /* Software version */ -- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; -- U32 v = val; -- v |= v >> 1; -- v |= v >> 2; -- v |= v >> 4; -- v |= v >> 8; -- v |= v >> 16; -- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; --# endif -- } --} -- --/* -- * Counts the number of trailing zeros of a `size_t`. -- * Most compilers should support CTZ as a builtin. A backup -- * implementation is provided if the builtin isn't supported, but -- * it may not be terribly efficient. -- */ --MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) --{ -- if (MEM_64bits()) { --# if (__GNUC__ >= 4) -- return __builtin_ctzll((U64)val); --# else -- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, -- 4, 25, 14, 28, 9, 34, 20, 56, -- 5, 17, 26, 54, 15, 41, 29, 43, -- 10, 31, 38, 35, 21, 45, 49, 57, -- 63, 6, 12, 18, 24, 27, 33, 55, -- 16, 53, 40, 42, 30, 37, 44, 48, -- 62, 11, 23, 32, 52, 39, 36, 47, -- 61, 22, 51, 46, 60, 50, 59, 58 }; -- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; --# endif -- } else { /* 32 bits */ --# if (__GNUC__ >= 3) -- return __builtin_ctz((U32)val); --# else -- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, -- 30, 22, 20, 15, 25, 17, 4, 8, -- 31, 27, 13, 23, 21, 19, 16, 7, -- 26, 12, 18, 6, 11, 5, 10, 9 }; -- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; --# endif -- } --} -+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ - - - /* ZSTD_invalidateRepCodes() : -@@ -420,13 +357,13 @@ typedef struct { - - /*! 
ZSTD_getcBlockSize() : - * Provides the size of compressed block from block header `src` */ --/* Used by: decompress, fullbench (does not get its definition from here) */ -+/* Used by: decompress, fullbench */ - size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, - blockProperties_t* bpPtr); - - /*! ZSTD_decodeSeqHeaders() : - * decode sequence header from src */ --/* Used by: decompress, fullbench (does not get its definition from here) */ -+/* Used by: zstd_decompress_block, fullbench */ - size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, - const void* src, size_t srcSize); - -diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h -index d9a76112ec3a..6ab8be6532ef 100644 ---- a/lib/zstd/compress/clevels.h -+++ b/lib/zstd/compress/clevels.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c -index ec5b1ca6d71a..44a3c10becf2 100644 ---- a/lib/zstd/compress/fse_compress.c -+++ b/lib/zstd/compress/fse_compress.c -@@ -1,6 +1,7 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* ****************************************************************** - * FSE : Finite State Entropy encoder -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * You can contact the author at : - * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy -@@ -25,7 +26,8 @@ - #include "../common/error_private.h" - #define ZSTD_DEPS_NEED_MALLOC - #define ZSTD_DEPS_NEED_MATH64 --#include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ -+#include "../common/zstd_deps.h" /* ZSTD_memset */ -+#include "../common/bits.h" /* ZSTD_highbit32 */ - - - /* ************************************************************** -@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, - assert(tableLog < 16); /* required for threshold strategy to work */ - - /* For explanations on how to distribute symbol values over the table : -- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ -+ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ - - #ifdef __clang_analyzer__ - ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ -@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, - break; - default : - assert(normalizedCounter[s] > 1); -- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); -+ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); - U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; - symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; - symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); -@@ -224,8 +226,8 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) - size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog - + 4 /* bitCount initialized at 4 */ - + 2 /* first two symbols may use one additional bit each */) / 8) -- + 1 /* round up to whole nb bytes */ -- + 2 /* additional two bytes for bitstream flush */; -+ + 1 /* round up to whole nb bytes */ -+ + 2 /* additional two bytes for bitstream 
flush */; - return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ - } - -@@ -254,7 +256,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, - /* Init */ - remaining = tableSize+1; /* +1 for extra accuracy */ - threshold = tableSize; -- nbBits = tableLog+1; -+ nbBits = (int)tableLog+1; - - while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ - if (previousIs0) { -@@ -273,7 +275,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, - } - while (symbol >= start+3) { - start+=3; -- bitStream += 3 << bitCount; -+ bitStream += 3U << bitCount; - bitCount += 2; - } - bitStream += (symbol-start) << bitCount; -@@ -293,7 +295,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, - count++; /* +1 for extra accuracy */ - if (count>=threshold) - count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */ -- bitStream += count << bitCount; -+ bitStream += (U32)count << bitCount; - bitCount += nbBits; - bitCount -= (count>8); - out+= (bitCount+7) /8; - -- return (out-ostart); -+ assert(out >= ostart); -+ return (size_t)(out-ostart); - } - - -@@ -342,21 +345,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, - * FSE Compression Code - ****************************************************************/ - --FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) --{ -- size_t size; -- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; -- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); -- return (FSE_CTable*)ZSTD_malloc(size); --} -- --void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } -- - /* provides the minimum logSize to safely represent a distribution */ - static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) - { -- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; -- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; -+ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; -+ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; - U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; - assert(srcSize > 1); /* Not supported, RLE should be used instead */ - return minBits; -@@ -364,7 +357,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) - - unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) - { -- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; -+ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; - U32 tableLog = maxTableLog; - U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); - assert(srcSize > 1); /* Not supported, RLE should be used instead */ -@@ -532,40 +525,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, - return tableLog; - } - -- --/* fake FSE_CTable, for raw (uncompressed) input */ --size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) --{ -- const unsigned tableSize = 1 << nbBits; -- const unsigned tableMask = tableSize - 1; -- const unsigned maxSymbolValue = tableMask; -- void* const ptr = ct; -- U16* const tableU16 = ( (U16*) ptr) + 2; -- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ -- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); -- unsigned s; -- -- /* Sanity checks */ -- if (nbBits < 1) return ERROR(GENERIC); /* min size */ -- -- /* header */ -- tableU16[-2] = (U16) nbBits; -- tableU16[-1] = (U16) maxSymbolValue; -- -- /* Build table */ -- for (s=0; s= 2 -+ -+static size_t showU32(const U32* arr, size_t size) - { -- return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); -+ size_t u; -+ for (u=0; u= sizeof(HUF_WriteCTableWksp)); -+ -+ assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); -+ assert(HUF_readCTableHeader(CTable).tableLog == huffLog); -+ - /* check conditions */ - if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); - if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); -@@ -204,16 +286,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, - return ((maxSymbolValue+1)/2) + 1; - } - --/*! HUF_writeCTable() : -- `CTable` : Huffman tree to save, using huf representation. 
-- @return : size of saved CTable */ --size_t HUF_writeCTable (void* dst, size_t maxDstSize, -- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) --{ -- HUF_WriteCTableWksp wksp; -- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); --} -- - - size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) - { -@@ -231,7 +303,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void - if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); - if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); - -- CTable[0] = tableLog; -+ *maxSymbolValuePtr = nbSymbols - 1; -+ -+ HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); - - /* Prepare base value per rank */ - { U32 n, nextRankStart = 0; -@@ -263,74 +337,71 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void - { U32 n; for (n=0; n HUF_readCTableHeader(CTable).maxSymbolValue) -+ return 0; - return (U32)HUF_getNbBits(ct[symbolValue]); - } - - --typedef struct nodeElt_s { -- U32 count; -- U16 parent; -- BYTE byte; -- BYTE nbBits; --} nodeElt; -- - /* - * HUF_setMaxHeight(): -- * Enforces maxNbBits on the Huffman tree described in huffNode. -+ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. - * -- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts -- * the tree to so that it is a valid canonical Huffman tree. -+ * It attempts to convert all nodes with nbBits > @targetNbBits -+ * to employ @targetNbBits instead. Then it adjusts the tree -+ * so that it remains a valid canonical Huffman tree. - * - * @pre The sum of the ranks of each symbol == 2^largestBits, - * where largestBits == huffNode[lastNonNull].nbBits. - * @post The sum of the ranks of each symbol == 2^largestBits, -- * where largestBits is the return value <= maxNbBits. -+ * where largestBits is the return value (expected <= targetNbBits). - * -- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. -+ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. -+ * It's presumed sorted, from most frequent to rarest symbol. - * @param lastNonNull The symbol with the lowest count in the Huffman tree. -- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree -+ * @param targetNbBits The allowed number of bits, which the Huffman tree - * may not respect. After this function the Huffman tree will -- * respect maxNbBits. -- * @return The maximum number of bits of the Huffman tree after adjustment, -- * necessarily no more than maxNbBits. -+ * respect targetNbBits. -+ * @return The maximum number of bits of the Huffman tree after adjustment. - */ --static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) -+static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) - { - const U32 largestBits = huffNode[lastNonNull].nbBits; -- /* early exit : no elt > maxNbBits, so the tree is already valid. */ -- if (largestBits <= maxNbBits) return largestBits; -+ /* early exit : no elt > targetNbBits, so the tree is already valid. 
*/ -+ if (largestBits <= targetNbBits) return largestBits; -+ -+ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); - - /* there are several too large elements (at least >= 2) */ - { int totalCost = 0; -- const U32 baseCost = 1 << (largestBits - maxNbBits); -+ const U32 baseCost = 1 << (largestBits - targetNbBits); - int n = (int)lastNonNull; - -- /* Adjust any ranks > maxNbBits to maxNbBits. -+ /* Adjust any ranks > targetNbBits to targetNbBits. - * Compute totalCost, which is how far the sum of the ranks is - * we are over 2^largestBits after adjust the offending ranks. - */ -- while (huffNode[n].nbBits > maxNbBits) { -+ while (huffNode[n].nbBits > targetNbBits) { - totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); -- huffNode[n].nbBits = (BYTE)maxNbBits; -+ huffNode[n].nbBits = (BYTE)targetNbBits; - n--; - } -- /* n stops at huffNode[n].nbBits <= maxNbBits */ -- assert(huffNode[n].nbBits <= maxNbBits); -- /* n end at index of smallest symbol using < maxNbBits */ -- while (huffNode[n].nbBits == maxNbBits) --n; -+ /* n stops at huffNode[n].nbBits <= targetNbBits */ -+ assert(huffNode[n].nbBits <= targetNbBits); -+ /* n end at index of smallest symbol using < targetNbBits */ -+ while (huffNode[n].nbBits == targetNbBits) --n; - -- /* renorm totalCost from 2^largestBits to 2^maxNbBits -+ /* renorm totalCost from 2^largestBits to 2^targetNbBits - * note : totalCost is necessarily a multiple of baseCost */ -- assert((totalCost & (baseCost - 1)) == 0); -- totalCost >>= (largestBits - maxNbBits); -+ assert(((U32)totalCost & (baseCost - 1)) == 0); -+ totalCost >>= (largestBits - targetNbBits); - assert(totalCost > 0); - - /* repay normalized cost */ -@@ -339,19 +410,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) - - /* Get pos of last (smallest = lowest cum. count) symbol per rank */ - ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); -- { U32 currentNbBits = maxNbBits; -+ { U32 currentNbBits = targetNbBits; - int pos; - for (pos=n ; pos >= 0; pos--) { - if (huffNode[pos].nbBits >= currentNbBits) continue; -- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ -- rankLast[maxNbBits-currentNbBits] = (U32)pos; -+ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ -+ rankLast[targetNbBits-currentNbBits] = (U32)pos; - } } - - while (totalCost > 0) { - /* Try to reduce the next power of 2 above totalCost because we - * gain back half the rank. - */ -- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; -+ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; - for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { - U32 const highPos = rankLast[nBitsToDecrease]; - U32 const lowPos = rankLast[nBitsToDecrease-1]; -@@ -391,7 +462,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) - rankLast[nBitsToDecrease] = noSymbol; - else { - rankLast[nBitsToDecrease]--; -- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) -+ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) - rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ - } - } /* while (totalCost > 0) */ -@@ -403,11 +474,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) - * TODO. - */ - while (totalCost < 0) { /* Sometimes, cost correction overshoot */ -- /* special case : no rank 1 symbol (using maxNbBits-1); -- * let's create one from largest rank 0 (using maxNbBits). 
-+ /* special case : no rank 1 symbol (using targetNbBits-1); -+ * let's create one from largest rank 0 (using targetNbBits). - */ - if (rankLast[1] == noSymbol) { -- while (huffNode[n].nbBits == maxNbBits) n--; -+ while (huffNode[n].nbBits == targetNbBits) n--; - huffNode[n+1].nbBits--; - assert(n >= 0); - rankLast[1] = (U32)(n+1); -@@ -421,7 +492,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) - } /* repay normalized cost */ - } /* there are several too large elements (at least >= 2) */ - -- return maxNbBits; -+ return targetNbBits; - } - - typedef struct { -@@ -429,7 +500,7 @@ typedef struct { - U16 curr; - } rankPos; - --typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; -+typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; - - /* Number of buckets available for HUF_sort() */ - #define RANK_POSITION_TABLE_SIZE 192 -@@ -448,8 +519,8 @@ typedef struct { - * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. - */ - #define RANK_POSITION_MAX_COUNT_LOG 32 --#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ --#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ -+#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) -+#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) - - /* Return the appropriate bucket index for a given count. See definition of - * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. -@@ -457,7 +528,7 @@ typedef struct { - static U32 HUF_getIndex(U32 const count) { - return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) - ? count -- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; -+ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; - } - - /* Helper swap function for HUF_quickSortPartition() */ -@@ -580,7 +651,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy - - /* Sort each bucket. */ - for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { -- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; -+ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; - U32 const bucketStartIdx = rankPosition[n].base; - if (bucketSize > 1) { - assert(bucketStartIdx < maxSymbolValue1); -@@ -591,6 +662,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy - assert(HUF_isSorted(huffNode, maxSymbolValue1)); - } - -+ - /* HUF_buildCTable_wksp() : - * Same as HUF_buildCTable(), but using externally allocated scratch buffer. - * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). 
-@@ -611,6 +683,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) - int lowS, lowN; - int nodeNb = STARTNODE; - int n, nodeRoot; -+ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); - /* init for parents */ - nonNullRank = (int)maxSymbolValue; - while(huffNode[nonNullRank].count == 0) nonNullRank--; -@@ -637,6 +710,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) - for (n=0; n<=nonNullRank; n++) - huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; - -+ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); -+ - return nonNullRank; - } - -@@ -671,31 +746,40 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i - HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ - for (n=0; nhuffNodeTbl; - nodeElt* const huffNode = huffNode0+1; - int nonNullRank; - -+ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); -+ -+ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); -+ - /* safety checks */ - if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) -- return ERROR(workSpace_tooSmall); -+ return ERROR(workSpace_tooSmall); - if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; - if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) -- return ERROR(maxSymbolValue_tooLarge); -+ return ERROR(maxSymbolValue_tooLarge); - ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); - - /* sort, decreasing order */ - HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); -+ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); - - /* build tree */ - nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); - -- /* enforce maxTableLog */ -+ /* determine and enforce maxTableLog */ - maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); - if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ - -@@ -716,13 +800,20 @@ size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, - } - - int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { -- HUF_CElt const* ct = CTable + 1; -- int bad = 0; -- int s; -- for (s = 0; s <= (int)maxSymbolValue; ++s) { -- bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); -- } -- return !bad; -+ HUF_CTableHeader header = HUF_readCTableHeader(CTable); -+ HUF_CElt const* ct = CTable + 1; -+ int bad = 0; -+ int s; -+ -+ assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); -+ -+ if (header.maxSymbolValue < maxSymbolValue) -+ return 0; -+ -+ for (s = 0; s <= (int)maxSymbolValue; ++s) { -+ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); -+ } -+ return !bad; - } - - size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } -@@ -804,7 +895,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id - #if DEBUGLEVEL >= 1 - { - size_t const nbBits = HUF_getNbBits(elt); -- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; -+ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; - (void)dirtyBits; - /* Middle bits are 0. 
*/ - assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); -@@ -884,7 +975,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC) - { - size_t const nbBits = bitC->bitPos[0] & 0xFF; - if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ -- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); -+ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); - } - } - -@@ -964,17 +1055,17 @@ HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, - const void* src, size_t srcSize, - const HUF_CElt* CTable) - { -- U32 const tableLog = (U32)CTable[0]; -+ U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; - HUF_CElt const* ct = CTable + 1; - const BYTE* ip = (const BYTE*) src; - BYTE* const ostart = (BYTE*)dst; - BYTE* const oend = ostart + dstSize; -- BYTE* op = ostart; - HUF_CStream_t bitC; - - /* init */ - if (dstSize < 8) return 0; /* not enough space to compress */ -- { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); -+ { BYTE* op = ostart; -+ size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); - if (HUF_isError(initErr)) return 0; } - - if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) -@@ -1045,9 +1136,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, - static size_t - HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, - const void* src, size_t srcSize, -- const HUF_CElt* CTable, const int bmi2) -+ const HUF_CElt* CTable, const int flags) - { -- if (bmi2) { -+ if (flags & HUF_flags_bmi2) { - return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); - } - return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); -@@ -1058,28 +1149,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, - static size_t - HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, - const void* src, size_t srcSize, -- const HUF_CElt* CTable, const int bmi2) -+ const HUF_CElt* CTable, const int flags) - { -- (void)bmi2; -+ (void)flags; - return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); - } - - #endif - --size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) -+size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) - { -- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); --} -- --size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) --{ -- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); -+ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); - } - - static size_t - HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, - const void* src, size_t srcSize, -- const HUF_CElt* CTable, int bmi2) -+ const HUF_CElt* CTable, int flags) - { - size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ - const BYTE* ip = (const BYTE*) src; -@@ -1093,7 +1179,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, - op += 6; /* jumpTable */ - - assert(op <= oend); -- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); -+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); - if (cSize == 0 || cSize > 65535) 
return 0; - MEM_writeLE16(ostart, (U16)cSize); - op += cSize; -@@ -1101,7 +1187,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, - - ip += segmentSize; - assert(op <= oend); -- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); -+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); - if (cSize == 0 || cSize > 65535) return 0; - MEM_writeLE16(ostart+2, (U16)cSize); - op += cSize; -@@ -1109,7 +1195,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, - - ip += segmentSize; - assert(op <= oend); -- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); -+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); - if (cSize == 0 || cSize > 65535) return 0; - MEM_writeLE16(ostart+4, (U16)cSize); - op += cSize; -@@ -1118,7 +1204,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, - ip += segmentSize; - assert(op <= oend); - assert(ip <= iend); -- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); -+ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); - if (cSize == 0 || cSize > 65535) return 0; - op += cSize; - } -@@ -1126,14 +1212,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, - return (size_t)(op-ostart); - } - --size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) --{ -- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); --} -- --size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) -+size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) - { -- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); -+ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); - } - - typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; -@@ -1141,11 +1222,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; - static size_t HUF_compressCTable_internal( - BYTE* const ostart, BYTE* op, BYTE* const oend, - const void* src, size_t srcSize, -- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) -+ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) - { - size_t const cSize = (nbStreams==HUF_singleStream) ? 
-- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : -- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); -+ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : -+ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); - if (HUF_isError(cSize)) { return cSize; } - if (cSize==0) { return 0; } /* uncompressible */ - op += cSize; -@@ -1168,6 +1249,81 @@ typedef struct { - #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 - #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ - -+unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) -+{ -+ unsigned cardinality = 0; -+ unsigned i; -+ -+ for (i = 0; i < maxSymbolValue + 1; i++) { -+ if (count[i] != 0) cardinality += 1; -+ } -+ -+ return cardinality; -+} -+ -+unsigned HUF_minTableLog(unsigned symbolCardinality) -+{ -+ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; -+ return minBitsSymbols; -+} -+ -+unsigned HUF_optimalTableLog( -+ unsigned maxTableLog, -+ size_t srcSize, -+ unsigned maxSymbolValue, -+ void* workSpace, size_t wkspSize, -+ HUF_CElt* table, -+ const unsigned* count, -+ int flags) -+{ -+ assert(srcSize > 1); /* Not supported, RLE should be used instead */ -+ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); -+ -+ if (!(flags & HUF_flags_optimalDepth)) { -+ /* cheap evaluation, based on FSE */ -+ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); -+ } -+ -+ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); -+ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); -+ size_t hSize, newSize; -+ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); -+ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); -+ size_t optSize = ((size_t) ~0) - 1; -+ unsigned optLog = maxTableLog, optLogGuess; -+ -+ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); -+ -+ /* Search until size increases */ -+ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { -+ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); -+ -+ { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); -+ if (ERR_isError(maxBits)) continue; -+ -+ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; -+ -+ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); -+ } -+ -+ if (ERR_isError(hSize)) continue; -+ -+ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; -+ -+ if (newSize > optSize + 1) { -+ break; -+ } -+ -+ if (newSize < optSize) { -+ optSize = newSize; -+ optLog = optLogGuess; -+ } -+ } -+ assert(optLog <= HUF_TABLELOG_MAX); -+ return optLog; -+ } -+} -+ - /* HUF_compress_internal() : - * `workSpace_align4` must be aligned on 4-bytes boundaries, - * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ -@@ -1177,14 +1333,14 @@ HUF_compress_internal (void* dst, size_t dstSize, - unsigned maxSymbolValue, unsigned huffLog, - HUF_nbStreams_e nbStreams, - void* workSpace, size_t wkspSize, -- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, -- const int bmi2, unsigned suspectUncompressible) -+ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) - { - HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); - BYTE* const ostart 
= (BYTE*)dst; - BYTE* const oend = ostart + dstSize; - BYTE* op = ostart; - -+ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); - HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); - - /* checks & inits */ -@@ -1198,16 +1354,17 @@ HUF_compress_internal (void* dst, size_t dstSize, - if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; - - /* Heuristic : If old table is valid, use it for small inputs */ -- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { -+ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { - return HUF_compressCTable_internal(ostart, op, oend, - src, srcSize, -- nbStreams, oldHufTable, bmi2); -+ nbStreams, oldHufTable, flags); - } - - /* If uncompressible data is suspected, do a smaller sampling first */ - DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); -- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { -+ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { - size_t largestTotal = 0; -+ DEBUGLOG(5, "input suspected incompressible : sampling to check"); - { unsigned maxSymbolValueBegin = maxSymbolValue; - CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); - largestTotal += largestBegin; -@@ -1224,6 +1381,7 @@ HUF_compress_internal (void* dst, size_t dstSize, - if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ - if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ - } -+ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); - - /* Check validity of previous table */ - if ( repeat -@@ -1232,25 +1390,20 @@ HUF_compress_internal (void* dst, size_t dstSize, - *repeat = HUF_repeat_none; - } - /* Heuristic : use existing table for small inputs */ -- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { -+ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { - return HUF_compressCTable_internal(ostart, op, oend, - src, srcSize, -- nbStreams, oldHufTable, bmi2); -+ nbStreams, oldHufTable, flags); - } - - /* Build Huffman Tree */ -- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); -+ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); - { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, - maxSymbolValue, huffLog, - &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); - CHECK_F(maxBits); - huffLog = (U32)maxBits; -- } -- /* Zero unused symbols in CTable, so we can check it for validity */ -- { -- size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); -- size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); -- ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); -+ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); - } - - /* Write table description header */ -@@ -1263,7 +1416,7 @@ HUF_compress_internal (void* dst, size_t dstSize, - if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { - return HUF_compressCTable_internal(ostart, op, oend, - src, srcSize, -- nbStreams, oldHufTable, bmi2); -+ nbStreams, oldHufTable, flags); - } } - - /* Use the new 
huffman table */ -@@ -1275,61 +1428,35 @@ HUF_compress_internal (void* dst, size_t dstSize, - } - return HUF_compressCTable_internal(ostart, op, oend, - src, srcSize, -- nbStreams, table->CTable, bmi2); --} -- -- --size_t HUF_compress1X_wksp (void* dst, size_t dstSize, -- const void* src, size_t srcSize, -- unsigned maxSymbolValue, unsigned huffLog, -- void* workSpace, size_t wkspSize) --{ -- return HUF_compress_internal(dst, dstSize, src, srcSize, -- maxSymbolValue, huffLog, HUF_singleStream, -- workSpace, wkspSize, -- NULL, NULL, 0, 0 /*bmi2*/, 0); -+ nbStreams, table->CTable, flags); - } - - size_t HUF_compress1X_repeat (void* dst, size_t dstSize, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned huffLog, - void* workSpace, size_t wkspSize, -- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, -- int bmi2, unsigned suspectUncompressible) -+ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) - { -+ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); - return HUF_compress_internal(dst, dstSize, src, srcSize, - maxSymbolValue, huffLog, HUF_singleStream, - workSpace, wkspSize, hufTable, -- repeat, preferRepeat, bmi2, suspectUncompressible); --} -- --/* HUF_compress4X_repeat(): -- * compress input using 4 streams. -- * provide workspace to generate compression tables */ --size_t HUF_compress4X_wksp (void* dst, size_t dstSize, -- const void* src, size_t srcSize, -- unsigned maxSymbolValue, unsigned huffLog, -- void* workSpace, size_t wkspSize) --{ -- return HUF_compress_internal(dst, dstSize, src, srcSize, -- maxSymbolValue, huffLog, HUF_fourStreams, -- workSpace, wkspSize, -- NULL, NULL, 0, 0 /*bmi2*/, 0); -+ repeat, flags); - } - - /* HUF_compress4X_repeat(): - * compress input using 4 streams. - * consider skipping quickly -- * re-use an existing huffman compression table */ -+ * reuse an existing huffman compression table */ - size_t HUF_compress4X_repeat (void* dst, size_t dstSize, - const void* src, size_t srcSize, - unsigned maxSymbolValue, unsigned huffLog, - void* workSpace, size_t wkspSize, -- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) -+ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) - { -+ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); - return HUF_compress_internal(dst, dstSize, src, srcSize, - maxSymbolValue, huffLog, HUF_fourStreams, - workSpace, wkspSize, -- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); -+ hufTable, repeat, flags); - } -- -diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c -index 16bb995bc6c4..885167f7e47b 100644 ---- a/lib/zstd/compress/zstd_compress.c -+++ b/lib/zstd/compress/zstd_compress.c -@@ -1,5 +1,6 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -11,12 +12,12 @@ - /*-************************************* - * Dependencies - ***************************************/ -+#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ - #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ - #include "../common/mem.h" - #include "hist.h" /* HIST_countFast_wksp */ - #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ - #include "../common/fse.h" --#define HUF_STATIC_LINKING_ONLY - #include "../common/huf.h" - #include "zstd_compress_internal.h" - #include "zstd_compress_sequences.h" -@@ -27,6 +28,7 @@ - #include "zstd_opt.h" - #include "zstd_ldm.h" - #include "zstd_compress_superblock.h" -+#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ - - /* *************************************************************** - * Tuning parameters -@@ -55,14 +57,17 @@ - * Helper functions - ***************************************/ - /* ZSTD_compressBound() -- * Note that the result from this function is only compatible with the "normal" -- * full-block strategy. -- * When there are a lot of small blocks due to frequent flush in streaming mode -- * the overhead of headers can make the compressed data to be larger than the -- * return value of ZSTD_compressBound(). -+ * Note that the result from this function is only valid for -+ * the one-pass compression functions. -+ * When employing the streaming mode, -+ * if flushes are frequently altering the size of blocks, -+ * the overhead from block headers can make the compressed data larger -+ * than the return value of ZSTD_compressBound(). - */ - size_t ZSTD_compressBound(size_t srcSize) { -- return ZSTD_COMPRESSBOUND(srcSize); -+ size_t const r = ZSTD_COMPRESSBOUND(srcSize); -+ if (r==0) return ERROR(srcSize_wrong); -+ return r; - } - - -@@ -168,15 +173,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) - - size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) - { -+ DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); - if (cctx==NULL) return 0; /* support free on NULL */ - RETURN_ERROR_IF(cctx->staticSize, memory_allocation, - "not compatible with static CCtx"); -- { -- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); -+ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); - ZSTD_freeCCtxContent(cctx); -- if (!cctxInWorkspace) { -- ZSTD_customFree(cctx, cctx->customMem); -- } -+ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); - } - return 0; - } -@@ -257,9 +260,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, - return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); - } - --/* Returns 1 if compression parameters are such that we should -+/* Returns ZSTD_ps_enable if compression parameters are such that we should - * enable long distance matching (wlog >= 27, strategy >= btopt). -- * Returns 0 otherwise. -+ * Returns ZSTD_ps_disable otherwise. - */ - static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, - const ZSTD_compressionParameters* const cParams) { -@@ -267,6 +270,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, - return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; - } - -+static int ZSTD_resolveExternalSequenceValidation(int mode) { -+ return mode; -+} -+ -+/* Resolves maxBlockSize to the default if no value is present. 
*/ -+static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { -+ if (maxBlockSize == 0) { -+ return ZSTD_BLOCKSIZE_MAX; -+ } else { -+ return maxBlockSize; -+ } -+} -+ -+static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) { -+ if (value != ZSTD_ps_auto) return value; -+ if (cLevel < 10) { -+ return ZSTD_ps_disable; -+ } else { -+ return ZSTD_ps_enable; -+ } -+} -+ -+/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. -+ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */ -+static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { -+ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; -+} -+ - static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( - ZSTD_compressionParameters cParams) - { -@@ -284,6 +315,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( - } - cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); - cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); -+ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); -+ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); -+ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, -+ cctxParams.compressionLevel); - assert(!ZSTD_checkCParams(cParams)); - return cctxParams; - } -@@ -329,10 +364,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) - #define ZSTD_NO_CLEVEL 0 - - /* -- * Initializes the cctxParams from params and compressionLevel. -+ * Initializes `cctxParams` from `params` and `compressionLevel`. - * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. - */ --static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) -+static void -+ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, -+ const ZSTD_parameters* params, -+ int compressionLevel) - { - assert(!ZSTD_checkCParams(params->cParams)); - ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); -@@ -345,6 +383,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par - cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); - cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); - cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); -+ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); -+ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); -+ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); - DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", - cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); - } -@@ -359,7 +400,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete - - /* - * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. -- * @param param Validated zstd parameters. 
-+ * @param params Validated zstd parameters. - */ - static void ZSTD_CCtxParams_setZstdParams( - ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) -@@ -455,8 +496,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) - return bounds; - - case ZSTD_c_enableLongDistanceMatching: -- bounds.lowerBound = 0; -- bounds.upperBound = 1; -+ bounds.lowerBound = (int)ZSTD_ps_auto; -+ bounds.upperBound = (int)ZSTD_ps_disable; - return bounds; - - case ZSTD_c_ldmHashLog: -@@ -549,6 +590,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) - bounds.upperBound = 1; - return bounds; - -+ case ZSTD_c_prefetchCDictTables: -+ bounds.lowerBound = (int)ZSTD_ps_auto; -+ bounds.upperBound = (int)ZSTD_ps_disable; -+ return bounds; -+ -+ case ZSTD_c_enableSeqProducerFallback: -+ bounds.lowerBound = 0; -+ bounds.upperBound = 1; -+ return bounds; -+ -+ case ZSTD_c_maxBlockSize: -+ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; -+ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; -+ return bounds; -+ -+ case ZSTD_c_searchForExternalRepcodes: -+ bounds.lowerBound = (int)ZSTD_ps_auto; -+ bounds.upperBound = (int)ZSTD_ps_disable; -+ return bounds; -+ - default: - bounds.error = ERROR(parameter_unsupported); - return bounds; -@@ -567,10 +628,11 @@ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) - return 0; - } - --#define BOUNDCHECK(cParam, val) { \ -- RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ -- parameter_outOfBound, "Param out of bounds"); \ --} -+#define BOUNDCHECK(cParam, val) \ -+ do { \ -+ RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ -+ parameter_outOfBound, "Param out of bounds"); \ -+ } while (0) - - - static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) -@@ -613,6 +675,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) - case ZSTD_c_useBlockSplitter: - case ZSTD_c_useRowMatchFinder: - case ZSTD_c_deterministicRefPrefix: -+ case ZSTD_c_prefetchCDictTables: -+ case ZSTD_c_enableSeqProducerFallback: -+ case ZSTD_c_maxBlockSize: -+ case ZSTD_c_searchForExternalRepcodes: - default: - return 0; - } -@@ -625,7 +691,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) - if (ZSTD_isUpdateAuthorized(param)) { - cctx->cParamsChanged = 1; - } else { -- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); -+ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); - } } - - switch(param) -@@ -668,6 +734,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) - case ZSTD_c_useBlockSplitter: - case ZSTD_c_useRowMatchFinder: - case ZSTD_c_deterministicRefPrefix: -+ case ZSTD_c_prefetchCDictTables: -+ case ZSTD_c_enableSeqProducerFallback: -+ case ZSTD_c_maxBlockSize: -+ case ZSTD_c_searchForExternalRepcodes: - break; - - default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); -@@ -723,12 +793,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, - case ZSTD_c_minMatch : - if (value!=0) /* 0 => use default */ - BOUNDCHECK(ZSTD_c_minMatch, value); -- CCtxParams->cParams.minMatch = value; -+ CCtxParams->cParams.minMatch = (U32)value; - return CCtxParams->cParams.minMatch; - - case ZSTD_c_targetLength : - BOUNDCHECK(ZSTD_c_targetLength, value); -- CCtxParams->cParams.targetLength = value; -+ CCtxParams->cParams.targetLength = (U32)value; - return CCtxParams->cParams.targetLength; - - case ZSTD_c_strategy : -@@ -741,12 +811,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, - /* Content size written in frame 
header _when known_ (default:1) */ - DEBUGLOG(4, "set content size flag = %u", (value!=0)); - CCtxParams->fParams.contentSizeFlag = value != 0; -- return CCtxParams->fParams.contentSizeFlag; -+ return (size_t)CCtxParams->fParams.contentSizeFlag; - - case ZSTD_c_checksumFlag : - /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ - CCtxParams->fParams.checksumFlag = value != 0; -- return CCtxParams->fParams.checksumFlag; -+ return (size_t)CCtxParams->fParams.checksumFlag; - - case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ - DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); -@@ -755,18 +825,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, - - case ZSTD_c_forceMaxWindow : - CCtxParams->forceWindow = (value != 0); -- return CCtxParams->forceWindow; -+ return (size_t)CCtxParams->forceWindow; - - case ZSTD_c_forceAttachDict : { - const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; -- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); -+ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); - CCtxParams->attachDictPref = pref; - return CCtxParams->attachDictPref; - } - - case ZSTD_c_literalCompressionMode : { - const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; -- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); -+ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); - CCtxParams->literalCompressionMode = lcm; - return CCtxParams->literalCompressionMode; - } -@@ -789,47 +859,50 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, - - case ZSTD_c_enableDedicatedDictSearch : - CCtxParams->enableDedicatedDictSearch = (value!=0); -- return CCtxParams->enableDedicatedDictSearch; -+ return (size_t)CCtxParams->enableDedicatedDictSearch; - - case ZSTD_c_enableLongDistanceMatching : -+ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); - CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; - return CCtxParams->ldmParams.enableLdm; - - case ZSTD_c_ldmHashLog : - if (value!=0) /* 0 ==> auto */ - BOUNDCHECK(ZSTD_c_ldmHashLog, value); -- CCtxParams->ldmParams.hashLog = value; -+ CCtxParams->ldmParams.hashLog = (U32)value; - return CCtxParams->ldmParams.hashLog; - - case ZSTD_c_ldmMinMatch : - if (value!=0) /* 0 ==> default */ - BOUNDCHECK(ZSTD_c_ldmMinMatch, value); -- CCtxParams->ldmParams.minMatchLength = value; -+ CCtxParams->ldmParams.minMatchLength = (U32)value; - return CCtxParams->ldmParams.minMatchLength; - - case ZSTD_c_ldmBucketSizeLog : - if (value!=0) /* 0 ==> default */ - BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); -- CCtxParams->ldmParams.bucketSizeLog = value; -+ CCtxParams->ldmParams.bucketSizeLog = (U32)value; - return CCtxParams->ldmParams.bucketSizeLog; - - case ZSTD_c_ldmHashRateLog : - if (value!=0) /* 0 ==> default */ - BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); -- CCtxParams->ldmParams.hashRateLog = value; -+ CCtxParams->ldmParams.hashRateLog = (U32)value; - return CCtxParams->ldmParams.hashRateLog; - - case ZSTD_c_targetCBlockSize : -- if (value!=0) /* 0 ==> default */ -+ if (value!=0) { /* 0 ==> default */ -+ value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); - BOUNDCHECK(ZSTD_c_targetCBlockSize, value); -- CCtxParams->targetCBlockSize = value; -+ } -+ CCtxParams->targetCBlockSize = (U32)value; - return CCtxParams->targetCBlockSize; - - case ZSTD_c_srcSizeHint : - if (value!=0) /* 0 ==> default */ - BOUNDCHECK(ZSTD_c_srcSizeHint, value); - CCtxParams->srcSizeHint = value; -- return CCtxParams->srcSizeHint; -+ return 
(size_t)CCtxParams->srcSizeHint; - - case ZSTD_c_stableInBuffer: - BOUNDCHECK(ZSTD_c_stableInBuffer, value); -@@ -849,7 +922,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, - case ZSTD_c_validateSequences: - BOUNDCHECK(ZSTD_c_validateSequences, value); - CCtxParams->validateSequences = value; -- return CCtxParams->validateSequences; -+ return (size_t)CCtxParams->validateSequences; - - case ZSTD_c_useBlockSplitter: - BOUNDCHECK(ZSTD_c_useBlockSplitter, value); -@@ -864,7 +937,28 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, - case ZSTD_c_deterministicRefPrefix: - BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); - CCtxParams->deterministicRefPrefix = !!value; -- return CCtxParams->deterministicRefPrefix; -+ return (size_t)CCtxParams->deterministicRefPrefix; -+ -+ case ZSTD_c_prefetchCDictTables: -+ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); -+ CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; -+ return CCtxParams->prefetchCDictTables; -+ -+ case ZSTD_c_enableSeqProducerFallback: -+ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); -+ CCtxParams->enableMatchFinderFallback = value; -+ return (size_t)CCtxParams->enableMatchFinderFallback; -+ -+ case ZSTD_c_maxBlockSize: -+ if (value!=0) /* 0 ==> default */ -+ BOUNDCHECK(ZSTD_c_maxBlockSize, value); -+ CCtxParams->maxBlockSize = value; -+ return CCtxParams->maxBlockSize; -+ -+ case ZSTD_c_searchForExternalRepcodes: -+ BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value); -+ CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value; -+ return CCtxParams->searchForExternalRepcodes; - - default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); - } -@@ -980,6 +1074,18 @@ size_t ZSTD_CCtxParams_getParameter( - case ZSTD_c_deterministicRefPrefix: - *value = (int)CCtxParams->deterministicRefPrefix; - break; -+ case ZSTD_c_prefetchCDictTables: -+ *value = (int)CCtxParams->prefetchCDictTables; -+ break; -+ case ZSTD_c_enableSeqProducerFallback: -+ *value = CCtxParams->enableMatchFinderFallback; -+ break; -+ case ZSTD_c_maxBlockSize: -+ *value = (int)CCtxParams->maxBlockSize; -+ break; -+ case ZSTD_c_searchForExternalRepcodes: -+ *value = (int)CCtxParams->searchForExternalRepcodes; -+ break; - default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); - } - return 0; -@@ -1006,9 +1112,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( - return 0; - } - -+size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) -+{ -+ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); -+ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); -+ /* only update if all parameters are valid */ -+ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), ""); -+ return 0; -+} -+ -+size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) -+{ -+ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); 
-+ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); -+ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); -+ return 0; -+} -+ -+size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) -+{ -+ DEBUGLOG(4, "ZSTD_CCtx_setParams"); -+ /* First check cParams, because we want to update all or none. */ -+ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); -+ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ -+ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); -+ /* Finally set cParams, which should succeed. */ -+ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); -+ return 0; -+} -+ - size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) - { -- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); -+ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); - RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, - "Can't set pledgedSrcSize when not in init stage."); - cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; -@@ -1024,9 +1168,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( - ZSTD_compressionParameters* cParams); - - /* -- * Initializes the local dict using the requested parameters. -- * NOTE: This does not use the pledged src size, because it may be used for more -- * than one compression. -+ * Initializes the local dictionary using requested parameters. -+ * NOTE: Initialization does not employ the pledged src size, -+ * because the dictionary may be used for multiple compressions. - */ - static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) - { -@@ -1039,8 +1183,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) - return 0; - } - if (dl->cdict != NULL) { -- assert(cctx->cdict == dl->cdict); - /* Local dictionary already initialized. 
*/ -+ assert(cctx->cdict == dl->cdict); - return 0; - } - assert(dl->dictSize > 0); -@@ -1060,26 +1204,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) - } - - size_t ZSTD_CCtx_loadDictionary_advanced( -- ZSTD_CCtx* cctx, const void* dict, size_t dictSize, -- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) -+ ZSTD_CCtx* cctx, -+ const void* dict, size_t dictSize, -+ ZSTD_dictLoadMethod_e dictLoadMethod, -+ ZSTD_dictContentType_e dictContentType) - { -- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, -- "Can't load a dictionary when ctx is not in init stage."); - DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); -- ZSTD_clearAllDicts(cctx); /* in case one already exists */ -- if (dict == NULL || dictSize == 0) /* no dictionary mode */ -+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, -+ "Can't load a dictionary when cctx is not in init stage."); -+ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ -+ if (dict == NULL || dictSize == 0) /* no dictionary */ - return 0; - if (dictLoadMethod == ZSTD_dlm_byRef) { - cctx->localDict.dict = dict; - } else { -+ /* copy dictionary content inside CCtx to own its lifetime */ - void* dictBuffer; - RETURN_ERROR_IF(cctx->staticSize, memory_allocation, -- "no malloc for static CCtx"); -+ "static CCtx can't allocate for an internal copy of dictionary"); - dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); -- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); -+ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, -+ "allocation failed for dictionary content"); - ZSTD_memcpy(dictBuffer, dict, dictSize); -- cctx->localDict.dictBuffer = dictBuffer; -- cctx->localDict.dict = dictBuffer; -+ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ -+ cctx->localDict.dict = dictBuffer; /* read-only reference */ - } - cctx->localDict.dictSize = dictSize; - cctx->localDict.dictContentType = dictContentType; -@@ -1149,7 +1297,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) - if ( (reset == ZSTD_reset_parameters) - || (reset == ZSTD_reset_session_and_parameters) ) { - RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, -- "Can't reset parameters only when not in init stage."); -+ "Reset parameters is only possible during init stage."); - ZSTD_clearAllDicts(cctx); - return ZSTD_CCtxParams_reset(&cctx->requestedParams); - } -@@ -1178,11 +1326,12 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) - static ZSTD_compressionParameters - ZSTD_clampCParams(ZSTD_compressionParameters cParams) - { --# define CLAMP_TYPE(cParam, val, type) { \ -- ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ -- if ((int)valbounds.upperBound) val=(type)bounds.upperBound; \ -- } -+# define CLAMP_TYPE(cParam, val, type) \ -+ do { \ -+ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ -+ if ((int)valbounds.upperBound) val=(type)bounds.upperBound; \ -+ } while (0) - # define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned) - CLAMP(ZSTD_c_windowLog, cParams.windowLog); - CLAMP(ZSTD_c_chainLog, cParams.chainLog); -@@ -1247,12 +1396,55 @@ static ZSTD_compressionParameters - ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, - unsigned long long srcSize, - size_t dictSize, -- ZSTD_cParamMode_e mode) -+ ZSTD_cParamMode_e mode, -+ ZSTD_paramSwitch_e useRowMatchFinder) - { - const U64 minSrcSize = 513; /* (1<<9) + 1 */ - const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); - 
assert(ZSTD_checkCParams(cPar)==0); - -+ /* Cascade the selected strategy down to the next-highest one built into -+ * this binary. */ -+#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR -+ if (cPar.strategy == ZSTD_btultra2) { -+ cPar.strategy = ZSTD_btultra; -+ } -+ if (cPar.strategy == ZSTD_btultra) { -+ cPar.strategy = ZSTD_btopt; -+ } -+#endif -+#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR -+ if (cPar.strategy == ZSTD_btopt) { -+ cPar.strategy = ZSTD_btlazy2; -+ } -+#endif -+#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR -+ if (cPar.strategy == ZSTD_btlazy2) { -+ cPar.strategy = ZSTD_lazy2; -+ } -+#endif -+#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR -+ if (cPar.strategy == ZSTD_lazy2) { -+ cPar.strategy = ZSTD_lazy; -+ } -+#endif -+#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR -+ if (cPar.strategy == ZSTD_lazy) { -+ cPar.strategy = ZSTD_greedy; -+ } -+#endif -+#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR -+ if (cPar.strategy == ZSTD_greedy) { -+ cPar.strategy = ZSTD_dfast; -+ } -+#endif -+#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR -+ if (cPar.strategy == ZSTD_dfast) { -+ cPar.strategy = ZSTD_fast; -+ cPar.targetLength = 0; -+ } -+#endif -+ - switch (mode) { - case ZSTD_cpm_unknown: - case ZSTD_cpm_noAttachDict: -@@ -1281,8 +1473,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, - } - - /* resize windowLog if input is small enough, to use less memory */ -- if ( (srcSize < maxWindowResize) -- && (dictSize < maxWindowResize) ) { -+ if ( (srcSize <= maxWindowResize) -+ && (dictSize <= maxWindowResize) ) { - U32 const tSize = (U32)(srcSize + dictSize); - static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; - U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : -@@ -1300,6 +1492,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, - if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) - cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ - -+ /* We can't use more than 32 bits of hash in total, so that means that we require: -+ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 -+ */ -+ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { -+ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; -+ if (cPar.hashLog > maxShortCacheHashLog) { -+ cPar.hashLog = maxShortCacheHashLog; -+ } -+ if (cPar.chainLog > maxShortCacheHashLog) { -+ cPar.chainLog = maxShortCacheHashLog; -+ } -+ } -+ -+ -+ /* At this point, we aren't 100% sure if we are using the row match finder. -+ * Unless it is explicitly disabled, conservatively assume that it is enabled. -+ * In this case it will only be disabled for small sources, so shrinking the -+ * hash log a little bit shouldn't result in any ratio loss. -+ */ -+ if (useRowMatchFinder == ZSTD_ps_auto) -+ useRowMatchFinder = ZSTD_ps_enable; -+ -+ /* We can't hash more than 32-bits in total. 
So that means that we require: -+ * (hashLog - rowLog + 8) <= 32 -+ */ -+ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { -+ /* Switch to 32-entry rows if searchLog is 5 (or more) */ -+ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); -+ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; -+ U32 const maxHashLog = maxRowHashLog + rowLog; -+ assert(cPar.hashLog >= rowLog); -+ if (cPar.hashLog > maxHashLog) { -+ cPar.hashLog = maxHashLog; -+ } -+ } -+ - return cPar; - } - -@@ -1310,7 +1538,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, - { - cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ - if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; -- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); -+ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); - } - - static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); -@@ -1341,7 +1569,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( - ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); - assert(!ZSTD_checkCParams(cParams)); - /* srcSizeHint == 0 means 0 */ -- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); -+ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); - } - - static size_t -@@ -1367,10 +1595,10 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, - + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32)) - + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32)) - + ZSTD_cwksp_aligned_alloc_size((1<strategy, useRowMatchFinder) -- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) -+ ? ZSTD_cwksp_aligned_alloc_size(hSize) - : 0; - size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) - ? optPotentialSpace -@@ -1386,6 +1614,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, - return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; - } - -+/* Helper function for calculating memory requirements. -+ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ -+static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { -+ U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4; -+ return blockSize / divider; -+} -+ - static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( - const ZSTD_compressionParameters* cParams, - const ldmParams_t* ldmParams, -@@ -1393,12 +1628,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( - const ZSTD_paramSwitch_e useRowMatchFinder, - const size_t buffInSize, - const size_t buffOutSize, -- const U64 pledgedSrcSize) -+ const U64 pledgedSrcSize, -+ int useSequenceProducer, -+ size_t maxBlockSize) - { - size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); -- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); -- U32 const divider = (cParams->minMatch==3) ? 
3 : 4; -- size_t const maxNbSeq = blockSize / divider; -+ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); -+ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); - size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) - + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) - + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); -@@ -1417,6 +1653,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( - - size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; - -+ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); -+ size_t const externalSeqSpace = useSequenceProducer -+ ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) -+ : 0; -+ - size_t const neededSpace = - cctxSpace + - entropySpace + -@@ -1425,7 +1666,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( - ldmSeqSpace + - matchStateSize + - tokenSpace + -- bufferSpace; -+ bufferSpace + -+ externalSeqSpace; - - DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); - return neededSpace; -@@ -1443,7 +1685,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) - * be needed. However, we still allocate two 0-sized buffers, which can - * take space under ASAN. */ - return ZSTD_estimateCCtxSize_usingCCtxParams_internal( -- &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); -+ &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); - } - - size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) -@@ -1493,7 +1735,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) - RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); - { ZSTD_compressionParameters const cParams = - ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); -- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); -+ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); - size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) - ? 
((size_t)1 << cParams.windowLog) + blockSize - : 0; -@@ -1504,7 +1746,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) - - return ZSTD_estimateCCtxSize_usingCCtxParams_internal( - &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, -- ZSTD_CONTENTSIZE_UNKNOWN); -+ ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); - } - } - -@@ -1637,6 +1879,19 @@ typedef enum { - ZSTD_resetTarget_CCtx - } ZSTD_resetTarget_e; - -+/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ -+static U64 ZSTD_bitmix(U64 val, U64 len) { -+ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); -+ val *= 0x9FB21C651E98DF25ULL; -+ val ^= (val >> 35) + len ; -+ val *= 0x9FB21C651E98DF25ULL; -+ return val ^ (val >> 28); -+} -+ -+/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ -+static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) { -+ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); -+} - - static size_t - ZSTD_reset_matchState(ZSTD_matchState_t* ms, -@@ -1664,6 +1919,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, - } - - ms->hashLog3 = hashLog3; -+ ms->lazySkipping = 0; - - ZSTD_invalidateMatchState(ms); - -@@ -1685,22 +1941,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, - ZSTD_cwksp_clean_tables(ws); - } - -- /* opt parser space */ -- if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { -- DEBUGLOG(4, "reserving optimal parser space"); -- ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned)); -- ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); -- ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); -- ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)); -- ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); -- } -- - if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { -- { /* Row match finder needs an additional table of hashes ("tags") */ -- size_t const tagTableSize = hSize*sizeof(U16); -- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); -- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); -+ /* Row match finder needs an additional table of hashes ("tags") */ -+ size_t const tagTableSize = hSize; -+ /* We want to generate a new salt in case we reset a Cctx, but we always want to use -+ * 0 when we reset a Cdict */ -+ if(forWho == ZSTD_resetTarget_CCtx) { -+ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); -+ ZSTD_advanceHashSalt(ms); -+ } else { -+ /* When we are not salting we want to always memset the memory */ -+ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); -+ ZSTD_memset(ms->tagTable, 0, tagTableSize); -+ ms->hashSalt = 0; - } - { /* Switch to 32-entry rows if searchLog is 5 (or more) */ - U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); -@@ -1709,6 +1962,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, - } - } - -+ /* opt parser space */ -+ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { -+ DEBUGLOG(4, "reserving optimal parser space"); -+ ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, 
(MaxLL+1) * sizeof(unsigned)); -+ ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); -+ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); -+ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); -+ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); -+ } -+ - ms->cParams = *cParams; - - RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, -@@ -1768,6 +2032,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - assert(params->useRowMatchFinder != ZSTD_ps_auto); - assert(params->useBlockSplitter != ZSTD_ps_auto); - assert(params->ldmParams.enableLdm != ZSTD_ps_auto); -+ assert(params->maxBlockSize != 0); - if (params->ldmParams.enableLdm == ZSTD_ps_enable) { - /* Adjust long distance matching parameters */ - ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); -@@ -1776,9 +2041,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - } - - { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); -- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); -- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; -- size_t const maxNbSeq = blockSize / divider; -+ size_t const blockSize = MIN(params->maxBlockSize, windowSize); -+ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params)); - size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) - ? ZSTD_compressBound(blockSize) + 1 - : 0; -@@ -1795,8 +2059,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - size_t const neededSpace = - ZSTD_estimateCCtxSize_usingCCtxParams_internal( - ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, -- buffInSize, buffOutSize, pledgedSrcSize); -- int resizeWorkspace; -+ buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize); - - FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); - -@@ -1805,7 +2068,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - { /* Check if workspace is large enough, alloc a new one if needed */ - int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; - int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); -- resizeWorkspace = workspaceTooSmall || workspaceWasteful; -+ int resizeWorkspace = workspaceTooSmall || workspaceWasteful; - DEBUGLOG(4, "Need %zu B workspace", neededSpace); - DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); - -@@ -1838,6 +2101,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - - /* init params */ - zc->blockState.matchState.cParams = params->cParams; -+ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; - zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; - zc->consumedSrcSize = 0; - zc->producedCSize = 0; -@@ -1854,13 +2118,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - - ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); - -+ FORWARD_IF_ERROR(ZSTD_reset_matchState( -+ &zc->blockState.matchState, -+ ws, -+ ¶ms->cParams, -+ params->useRowMatchFinder, -+ crp, -+ needsIndexReset, -+ ZSTD_resetTarget_CCtx), ""); -+ -+ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); -+ -+ /* ldm hash table */ -+ if (params->ldmParams.enableLdm 
== ZSTD_ps_enable) { -+ /* TODO: avoid memset? */ -+ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; -+ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); -+ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); -+ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); -+ zc->maxNbLdmSequences = maxNbLdmSeq; -+ -+ ZSTD_window_init(&zc->ldmState.window); -+ zc->ldmState.loadedDictEnd = 0; -+ } -+ -+ /* reserve space for block-level external sequences */ -+ if (ZSTD_hasExtSeqProd(params)) { -+ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); -+ zc->extSeqBufCapacity = maxNbExternalSeq; -+ zc->extSeqBuf = -+ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); -+ } -+ -+ /* buffers */ -+ - /* ZSTD_wildcopy() is used to copy into the literals buffer, - * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. - */ - zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); - zc->seqStore.maxNbLit = blockSize; - -- /* buffers */ - zc->bufferedPolicy = zbuff; - zc->inBuffSize = buffInSize; - zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); -@@ -1883,32 +2180,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); - zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); - zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); -- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); -- -- FORWARD_IF_ERROR(ZSTD_reset_matchState( -- &zc->blockState.matchState, -- ws, -- ¶ms->cParams, -- params->useRowMatchFinder, -- crp, -- needsIndexReset, -- ZSTD_resetTarget_CCtx), ""); -- -- /* ldm hash table */ -- if (params->ldmParams.enableLdm == ZSTD_ps_enable) { -- /* TODO: avoid memset? 
*/ -- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; -- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); -- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); -- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); -- zc->maxNbLdmSequences = maxNbLdmSeq; -- -- ZSTD_window_init(&zc->ldmState.window); -- zc->ldmState.loadedDictEnd = 0; -- } - - DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); -- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); -+ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); - - zc->initialized = 1; - -@@ -1980,7 +2254,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, - } - - params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, -- cdict->dictContentSize, ZSTD_cpm_attachDict); -+ cdict->dictContentSize, ZSTD_cpm_attachDict, -+ params.useRowMatchFinder); - params.cParams.windowLog = windowLog; - params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ - FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, -@@ -2019,6 +2294,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, - return 0; - } - -+static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, -+ ZSTD_compressionParameters const* cParams) { -+ if (ZSTD_CDictIndicesAreTagged(cParams)){ -+ /* Remove tags from the CDict table if they are present. -+ * See docs on "short cache" in zstd_compress_internal.h for context. */ -+ size_t i; -+ for (i = 0; i < tableSize; i++) { -+ U32 const taggedIndex = src[i]; -+ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; -+ dst[i] = index; -+ } -+ } else { -+ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); -+ } -+} -+ - static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, - const ZSTD_CDict* cdict, - ZSTD_CCtx_params params, -@@ -2054,21 +2345,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, - : 0; - size_t const hSize = (size_t)1 << cdict_cParams->hashLog; - -- ZSTD_memcpy(cctx->blockState.matchState.hashTable, -- cdict->matchState.hashTable, -- hSize * sizeof(U32)); -+ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, -+ cdict->matchState.hashTable, -+ hSize, cdict_cParams); -+ - /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ - if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { -- ZSTD_memcpy(cctx->blockState.matchState.chainTable, -- cdict->matchState.chainTable, -- chainSize * sizeof(U32)); -+ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, -+ cdict->matchState.chainTable, -+ chainSize, cdict_cParams); - } - /* copy tag table */ - if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { -- size_t const tagTableSize = hSize*sizeof(U16); -+ size_t const tagTableSize = hSize; - ZSTD_memcpy(cctx->blockState.matchState.tagTable, -- cdict->matchState.tagTable, -- tagTableSize); -+ cdict->matchState.tagTable, -+ tagTableSize); -+ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; - } - } - -@@ -2147,6 +2440,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, - params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; - params.ldmParams = srcCCtx->appliedParams.ldmParams; - params.fParams = fParams; -+ 
params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; - ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, - /* loadedDictSize */ 0, - ZSTDcrp_leaveDirty, zbuff); -@@ -2294,7 +2588,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par - - /* See doc/zstd_compression_format.md for detailed format description */ - --void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) -+int ZSTD_seqToCodes(const seqStore_t* seqStorePtr) - { - const seqDef* const sequences = seqStorePtr->sequencesStart; - BYTE* const llCodeTable = seqStorePtr->llCode; -@@ -2302,18 +2596,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) - BYTE* const mlCodeTable = seqStorePtr->mlCode; - U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); - U32 u; -+ int longOffsets = 0; - assert(nbSeq <= seqStorePtr->maxNbSeq); - for (u=0; u= STREAM_ACCUMULATOR_MIN)); -+ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) -+ longOffsets = 1; - } - if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) - llCodeTable[seqStorePtr->longLengthPos] = MaxLL; - if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) - mlCodeTable[seqStorePtr->longLengthPos] = MaxML; -+ return longOffsets; - } - - /* ZSTD_useTargetCBlockSize(): -@@ -2347,6 +2647,7 @@ typedef struct { - U32 MLtype; - size_t size; - size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ -+ int longOffsets; - } ZSTD_symbolEncodingTypeStats_t; - - /* ZSTD_buildSequencesStatistics(): -@@ -2357,11 +2658,13 @@ typedef struct { - * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) - */ - static ZSTD_symbolEncodingTypeStats_t --ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, -- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, -- BYTE* dst, const BYTE* const dstEnd, -- ZSTD_strategy strategy, unsigned* countWorkspace, -- void* entropyWorkspace, size_t entropyWkspSize) { -+ZSTD_buildSequencesStatistics( -+ const seqStore_t* seqStorePtr, size_t nbSeq, -+ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, -+ BYTE* dst, const BYTE* const dstEnd, -+ ZSTD_strategy strategy, unsigned* countWorkspace, -+ void* entropyWorkspace, size_t entropyWkspSize) -+{ - BYTE* const ostart = dst; - const BYTE* const oend = dstEnd; - BYTE* op = ostart; -@@ -2375,7 +2678,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, - - stats.lastCountSize = 0; - /* convert length/distances into codes */ -- ZSTD_seqToCodes(seqStorePtr); -+ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); - assert(op <= oend); - assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ - /* build CTable for Literal Lengths */ -@@ -2480,22 +2783,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, - */ - #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 - MEM_STATIC size_t --ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, -- const ZSTD_entropyCTables_t* prevEntropy, -- ZSTD_entropyCTables_t* nextEntropy, -- const ZSTD_CCtx_params* cctxParams, -- void* dst, size_t dstCapacity, -- void* entropyWorkspace, size_t entropyWkspSize, -- const int bmi2) -+ZSTD_entropyCompressSeqStore_internal( -+ const seqStore_t* seqStorePtr, -+ const ZSTD_entropyCTables_t* prevEntropy, -+ ZSTD_entropyCTables_t* nextEntropy, -+ const ZSTD_CCtx_params* cctxParams, -+ void* dst, size_t dstCapacity, -+ void* entropyWorkspace, size_t entropyWkspSize, -+ const int bmi2) - { -- 
const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; - ZSTD_strategy const strategy = cctxParams->cParams.strategy; - unsigned* count = (unsigned*)entropyWorkspace; - FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; - FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; - FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; - const seqDef* const sequences = seqStorePtr->sequencesStart; -- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; -+ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); - const BYTE* const ofCodeTable = seqStorePtr->ofCode; - const BYTE* const llCodeTable = seqStorePtr->llCode; - const BYTE* const mlCodeTable = seqStorePtr->mlCode; -@@ -2503,29 +2806,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, - BYTE* const oend = ostart + dstCapacity; - BYTE* op = ostart; - size_t lastCountSize; -+ int longOffsets = 0; - - entropyWorkspace = count + (MaxSeq + 1); - entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); - -- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); -+ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); - ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= HUF_WORKSPACE_SIZE); - - /* Compress literals */ - { const BYTE* const literals = seqStorePtr->litStart; -- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; -- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; -+ size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); -+ size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); - /* Base suspicion of uncompressibility on ratio of literals to sequences */ - unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); - size_t const litSize = (size_t)(seqStorePtr->lit - literals); -+ - size_t const cSize = ZSTD_compressLiterals( -- &prevEntropy->huf, &nextEntropy->huf, -- cctxParams->cParams.strategy, -- ZSTD_literalsCompressionIsDisabled(cctxParams), - op, dstCapacity, - literals, litSize, - entropyWorkspace, entropyWkspSize, -- bmi2, suspectUncompressible); -+ &prevEntropy->huf, &nextEntropy->huf, -+ cctxParams->cParams.strategy, -+ ZSTD_literalsCompressionIsDisabled(cctxParams), -+ suspectUncompressible, bmi2); - FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); - assert(cSize <= dstCapacity); - op += cSize; -@@ -2551,11 +2856,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, - ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); - return (size_t)(op - ostart); - } -- { -- ZSTD_symbolEncodingTypeStats_t stats; -- BYTE* seqHead = op++; -+ { BYTE* const seqHead = op++; - /* build stats for sequences */ -- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, -+ const ZSTD_symbolEncodingTypeStats_t stats = -+ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, - &prevEntropy->fse, &nextEntropy->fse, - op, oend, - strategy, count, -@@ -2564,6 +2868,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, - *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); - lastCountSize = stats.lastCountSize; - op += stats.size; -+ longOffsets = stats.longOffsets; - } - - { size_t const bitstreamSize = ZSTD_encodeSequences( -@@ -2598,14 +2903,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* 
seqStorePtr, - } - - MEM_STATIC size_t --ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, -- const ZSTD_entropyCTables_t* prevEntropy, -- ZSTD_entropyCTables_t* nextEntropy, -- const ZSTD_CCtx_params* cctxParams, -- void* dst, size_t dstCapacity, -- size_t srcSize, -- void* entropyWorkspace, size_t entropyWkspSize, -- int bmi2) -+ZSTD_entropyCompressSeqStore( -+ const seqStore_t* seqStorePtr, -+ const ZSTD_entropyCTables_t* prevEntropy, -+ ZSTD_entropyCTables_t* nextEntropy, -+ const ZSTD_CCtx_params* cctxParams, -+ void* dst, size_t dstCapacity, -+ size_t srcSize, -+ void* entropyWorkspace, size_t entropyWkspSize, -+ int bmi2) - { - size_t const cSize = ZSTD_entropyCompressSeqStore_internal( - seqStorePtr, prevEntropy, nextEntropy, cctxParams, -@@ -2615,15 +2921,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, - /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. - * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. - */ -- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) -+ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) { -+ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); - return 0; /* block not compressed */ -+ } - FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); - - /* Check compressibility */ - { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); - if (cSize >= maxCSize) return 0; /* block not compressed */ - } -- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); -+ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); -+ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. -+ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. 
-+ */ -+ assert(cSize < ZSTD_BLOCKSIZE_MAX); - return cSize; - } - -@@ -2635,40 +2947,43 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS - static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { - { ZSTD_compressBlock_fast /* default for 0 */, - ZSTD_compressBlock_fast, -- ZSTD_compressBlock_doubleFast, -- ZSTD_compressBlock_greedy, -- ZSTD_compressBlock_lazy, -- ZSTD_compressBlock_lazy2, -- ZSTD_compressBlock_btlazy2, -- ZSTD_compressBlock_btopt, -- ZSTD_compressBlock_btultra, -- ZSTD_compressBlock_btultra2 }, -+ ZSTD_COMPRESSBLOCK_DOUBLEFAST, -+ ZSTD_COMPRESSBLOCK_GREEDY, -+ ZSTD_COMPRESSBLOCK_LAZY, -+ ZSTD_COMPRESSBLOCK_LAZY2, -+ ZSTD_COMPRESSBLOCK_BTLAZY2, -+ ZSTD_COMPRESSBLOCK_BTOPT, -+ ZSTD_COMPRESSBLOCK_BTULTRA, -+ ZSTD_COMPRESSBLOCK_BTULTRA2 -+ }, - { ZSTD_compressBlock_fast_extDict /* default for 0 */, - ZSTD_compressBlock_fast_extDict, -- ZSTD_compressBlock_doubleFast_extDict, -- ZSTD_compressBlock_greedy_extDict, -- ZSTD_compressBlock_lazy_extDict, -- ZSTD_compressBlock_lazy2_extDict, -- ZSTD_compressBlock_btlazy2_extDict, -- ZSTD_compressBlock_btopt_extDict, -- ZSTD_compressBlock_btultra_extDict, -- ZSTD_compressBlock_btultra_extDict }, -+ ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, -+ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, -+ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, -+ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, -+ ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, -+ ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, -+ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, -+ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT -+ }, - { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, - ZSTD_compressBlock_fast_dictMatchState, -- ZSTD_compressBlock_doubleFast_dictMatchState, -- ZSTD_compressBlock_greedy_dictMatchState, -- ZSTD_compressBlock_lazy_dictMatchState, -- ZSTD_compressBlock_lazy2_dictMatchState, -- ZSTD_compressBlock_btlazy2_dictMatchState, -- ZSTD_compressBlock_btopt_dictMatchState, -- ZSTD_compressBlock_btultra_dictMatchState, -- ZSTD_compressBlock_btultra_dictMatchState }, -+ ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, -+ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, -+ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, -+ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, -+ ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, -+ ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, -+ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, -+ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE -+ }, - { NULL /* default for 0 */, - NULL, - NULL, -- ZSTD_compressBlock_greedy_dedicatedDictSearch, -- ZSTD_compressBlock_lazy_dedicatedDictSearch, -- ZSTD_compressBlock_lazy2_dedicatedDictSearch, -+ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, -+ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, -+ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, - NULL, - NULL, - NULL, -@@ -2681,18 +2996,26 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS - DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); - if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { - static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { -- { ZSTD_compressBlock_greedy_row, -- ZSTD_compressBlock_lazy_row, -- ZSTD_compressBlock_lazy2_row }, -- { ZSTD_compressBlock_greedy_extDict_row, -- ZSTD_compressBlock_lazy_extDict_row, -- ZSTD_compressBlock_lazy2_extDict_row }, -- { ZSTD_compressBlock_greedy_dictMatchState_row, -- ZSTD_compressBlock_lazy_dictMatchState_row, -- ZSTD_compressBlock_lazy2_dictMatchState_row }, -- { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, -- 
ZSTD_compressBlock_lazy_dedicatedDictSearch_row, -- ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } -+ { -+ ZSTD_COMPRESSBLOCK_GREEDY_ROW, -+ ZSTD_COMPRESSBLOCK_LAZY_ROW, -+ ZSTD_COMPRESSBLOCK_LAZY2_ROW -+ }, -+ { -+ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, -+ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, -+ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW -+ }, -+ { -+ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, -+ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, -+ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW -+ }, -+ { -+ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, -+ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, -+ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW -+ } - }; - DEBUGLOG(4, "Selecting a row-based matchfinder"); - assert(useRowMatchFinder != ZSTD_ps_auto); -@@ -2718,6 +3041,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) - ssPtr->longLengthType = ZSTD_llt_none; - } - -+/* ZSTD_postProcessSequenceProducerResult() : -+ * Validates and post-processes sequences obtained through the external matchfinder API: -+ * - Checks whether nbExternalSeqs represents an error condition. -+ * - Appends a block delimiter to outSeqs if one is not already present. -+ * See zstd.h for context regarding block delimiters. -+ * Returns the number of sequences after post-processing, or an error code. */ -+static size_t ZSTD_postProcessSequenceProducerResult( -+ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize -+) { -+ RETURN_ERROR_IF( -+ nbExternalSeqs > outSeqsCapacity, -+ sequenceProducer_failed, -+ "External sequence producer returned error code %lu", -+ (unsigned long)nbExternalSeqs -+ ); -+ -+ RETURN_ERROR_IF( -+ nbExternalSeqs == 0 && srcSize > 0, -+ sequenceProducer_failed, -+ "Got zero sequences from external sequence producer for a non-empty src buffer!" -+ ); -+ -+ if (srcSize == 0) { -+ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); -+ return 1; -+ } -+ -+ { -+ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; -+ -+ /* We can return early if lastSeq is already a block delimiter. */ -+ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { -+ return nbExternalSeqs; -+ } -+ -+ /* This error condition is only possible if the external matchfinder -+ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ -+ RETURN_ERROR_IF( -+ nbExternalSeqs == outSeqsCapacity, -+ sequenceProducer_failed, -+ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" -+ ); -+ -+ /* lastSeq is not a block delimiter, so we need to append one. */ -+ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); -+ return nbExternalSeqs + 1; -+ } -+} -+ -+/* ZSTD_fastSequenceLengthSum() : -+ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. -+ * Similar to another function in zstd_compress.c (determine_blockSize), -+ * except it doesn't check for a block delimiter to end summation. -+ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). -+ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. 
*/ -+static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { -+ size_t matchLenSum, litLenSum, i; -+ matchLenSum = 0; -+ litLenSum = 0; -+ for (i = 0; i < seqBufSize; i++) { -+ litLenSum += seqBuf[i].litLength; -+ matchLenSum += seqBuf[i].matchLength; -+ } -+ return litLenSum + matchLenSum; -+} -+ - typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; - - static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) -@@ -2727,7 +3116,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) - assert(srcSize <= ZSTD_BLOCKSIZE_MAX); - /* Assert that we have correctly flushed the ctx params into the ms's copy */ - ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); -- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { -+ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding -+ * additional 1. We need to revisit and change this logic to be more consistent */ -+ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { - if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { - ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); - } else { -@@ -2763,6 +3154,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) - } - if (zc->externSeqStore.pos < zc->externSeqStore.size) { - assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); -+ -+ /* External matchfinder + LDM is technically possible, just not implemented yet. -+ * We need to revisit soon and implement it. */ -+ RETURN_ERROR_IF( -+ ZSTD_hasExtSeqProd(&zc->appliedParams), -+ parameter_combination_unsupported, -+ "Long-distance matching with external sequence producer enabled is not currently supported." -+ ); -+ - /* Updates ldmSeqStore.pos */ - lastLLSize = - ZSTD_ldm_blockCompress(&zc->externSeqStore, -@@ -2774,6 +3174,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) - } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { - rawSeqStore_t ldmSeqStore = kNullRawSeqStore; - -+ /* External matchfinder + LDM is technically possible, just not implemented yet. -+ * We need to revisit soon and implement it. */ -+ RETURN_ERROR_IF( -+ ZSTD_hasExtSeqProd(&zc->appliedParams), -+ parameter_combination_unsupported, -+ "Long-distance matching with external sequence producer enabled is not currently supported." 
-+ ); -+ - ldmSeqStore.seq = zc->ldmSequences; - ldmSeqStore.capacity = zc->maxNbLdmSequences; - /* Updates ldmSeqStore.size */ -@@ -2788,10 +3196,74 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) - zc->appliedParams.useRowMatchFinder, - src, srcSize); - assert(ldmSeqStore.pos == ldmSeqStore.size); -- } else { /* not long range mode */ -- ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, -- zc->appliedParams.useRowMatchFinder, -- dictMode); -+ } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { -+ assert( -+ zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) -+ ); -+ assert(zc->appliedParams.extSeqProdFunc != NULL); -+ -+ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; -+ -+ size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( -+ zc->appliedParams.extSeqProdState, -+ zc->extSeqBuf, -+ zc->extSeqBufCapacity, -+ src, srcSize, -+ NULL, 0, /* dict and dictSize, currently not supported */ -+ zc->appliedParams.compressionLevel, -+ windowSize -+ ); -+ -+ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( -+ zc->extSeqBuf, -+ nbExternalSeqs, -+ zc->extSeqBufCapacity, -+ srcSize -+ ); -+ -+ /* Return early if there is no error, since we don't need to worry about last literals */ -+ if (!ZSTD_isError(nbPostProcessedSeqs)) { -+ ZSTD_sequencePosition seqPos = {0,0,0}; -+ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); -+ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); -+ FORWARD_IF_ERROR( -+ ZSTD_copySequencesToSeqStoreExplicitBlockDelim( -+ zc, &seqPos, -+ zc->extSeqBuf, nbPostProcessedSeqs, -+ src, srcSize, -+ zc->appliedParams.searchForExternalRepcodes -+ ), -+ "Failed to copy external sequences to seqStore!" -+ ); -+ ms->ldmSeqStore = NULL; -+ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); -+ return ZSTDbss_compress; -+ } -+ -+ /* Propagate the error if fallback is disabled */ -+ if (!zc->appliedParams.enableMatchFinderFallback) { -+ return nbPostProcessedSeqs; -+ } -+ -+ /* Fallback to software matchfinder */ -+ { ZSTD_blockCompressor const blockCompressor = -+ ZSTD_selectBlockCompressor( -+ zc->appliedParams.cParams.strategy, -+ zc->appliedParams.useRowMatchFinder, -+ dictMode); -+ ms->ldmSeqStore = NULL; -+ DEBUGLOG( -+ 5, -+ "External sequence producer returned error code %lu. 
Falling back to internal parser.", -+ (unsigned long)nbExternalSeqs -+ ); -+ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); -+ } } -+ } else { /* not long range mode and no external matchfinder */ -+ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor( -+ zc->appliedParams.cParams.strategy, -+ zc->appliedParams.useRowMatchFinder, -+ dictMode); - ms->ldmSeqStore = NULL; - lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); - } -@@ -2801,29 +3273,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) - return ZSTDbss_compress; - } - --static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) -+static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) - { -- const seqStore_t* seqStore = ZSTD_getSeqStore(zc); -- const seqDef* seqStoreSeqs = seqStore->sequencesStart; -- size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs; -- size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart); -- size_t literalsRead = 0; -- size_t lastLLSize; -+ const seqDef* inSeqs = seqStore->sequencesStart; -+ const size_t nbInSequences = seqStore->sequences - inSeqs; -+ const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); - -- ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; -+ ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; -+ const size_t nbOutSequences = nbInSequences + 1; -+ size_t nbOutLiterals = 0; -+ repcodes_t repcodes; - size_t i; -- repcodes_t updatedRepcodes; - -- assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); -- /* Ensure we have enough space for last literals "sequence" */ -- assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); -- ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); -- for (i = 0; i < seqStoreSeqSize; ++i) { -- U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; -- outSeqs[i].litLength = seqStoreSeqs[i].litLength; -- outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; -+ /* Bounds check that we have enough space for every input sequence -+ * and the block delimiter -+ */ -+ assert(seqCollector->seqIndex <= seqCollector->maxSequences); -+ RETURN_ERROR_IF( -+ nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), -+ dstSize_tooSmall, -+ "Not enough space to copy sequences"); -+ -+ ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); -+ for (i = 0; i < nbInSequences; ++i) { -+ U32 rawOffset; -+ outSeqs[i].litLength = inSeqs[i].litLength; -+ outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; - outSeqs[i].rep = 0; - -+ /* Handle the possible single length >= 64K -+ * There can only be one because we add MINMATCH to every match length, -+ * and blocks are at most 128K. -+ */ - if (i == seqStore->longLengthPos) { - if (seqStore->longLengthType == ZSTD_llt_literalLength) { - outSeqs[i].litLength += 0x10000; -@@ -2832,37 +3313,55 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) - } - } - -- if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { -- /* Derive the correct offset corresponding to a repcode */ -- outSeqs[i].rep = seqStoreSeqs[i].offBase; -+ /* Determine the raw offset given the offBase, which may be a repcode. 
*/ -+ if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { -+ const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); -+ assert(repcode > 0); -+ outSeqs[i].rep = repcode; - if (outSeqs[i].litLength != 0) { -- rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; -+ rawOffset = repcodes.rep[repcode - 1]; - } else { -- if (outSeqs[i].rep == 3) { -- rawOffset = updatedRepcodes.rep[0] - 1; -+ if (repcode == 3) { -+ assert(repcodes.rep[0] > 1); -+ rawOffset = repcodes.rep[0] - 1; - } else { -- rawOffset = updatedRepcodes.rep[outSeqs[i].rep]; -+ rawOffset = repcodes.rep[repcode]; - } - } -+ } else { -+ rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); - } - outSeqs[i].offset = rawOffset; -- /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode -- so we provide seqStoreSeqs[i].offset - 1 */ -- ZSTD_updateRep(updatedRepcodes.rep, -- seqStoreSeqs[i].offBase - 1, -- seqStoreSeqs[i].litLength == 0); -- literalsRead += outSeqs[i].litLength; -+ -+ /* Update repcode history for the sequence */ -+ ZSTD_updateRep(repcodes.rep, -+ inSeqs[i].offBase, -+ inSeqs[i].litLength == 0); -+ -+ nbOutLiterals += outSeqs[i].litLength; - } - /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. - * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker - * for the block boundary, according to the API. - */ -- assert(seqStoreLiteralsSize >= literalsRead); -- lastLLSize = seqStoreLiteralsSize - literalsRead; -- outSeqs[i].litLength = (U32)lastLLSize; -- outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0; -- seqStoreSeqSize++; -- zc->seqCollector.seqIndex += seqStoreSeqSize; -+ assert(nbInLiterals >= nbOutLiterals); -+ { -+ const size_t lastLLSize = nbInLiterals - nbOutLiterals; -+ outSeqs[nbInSequences].litLength = (U32)lastLLSize; -+ outSeqs[nbInSequences].matchLength = 0; -+ outSeqs[nbInSequences].offset = 0; -+ assert(nbOutSequences == nbInSequences + 1); -+ } -+ seqCollector->seqIndex += nbOutSequences; -+ assert(seqCollector->seqIndex <= seqCollector->maxSequences); -+ -+ return 0; -+} -+ -+size_t ZSTD_sequenceBound(size_t srcSize) { -+ const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; -+ const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; -+ return maxNbSeq + maxNbDelims; - } - - size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, -@@ -2871,6 +3370,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, - const size_t dstCapacity = ZSTD_compressBound(srcSize); - void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); - SeqCollector seqCollector; -+ { -+ int targetCBlockSize; -+ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); -+ RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); -+ } -+ { -+ int nbWorkers; -+ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); -+ RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); -+ } - - RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); - -@@ -2880,8 +3389,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, - seqCollector.maxSequences = outSeqsSize; - zc->seqCollector = seqCollector; - -- ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); -- ZSTD_customFree(dst, ZSTD_defaultCMem); -+ { -+ const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); -+ ZSTD_customFree(dst, ZSTD_defaultCMem); -+ FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); 
-+ } -+ assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize)); - return zc->seqCollector.seqIndex; - } - -@@ -2910,19 +3423,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { - const size_t unrollMask = unrollSize - 1; - const size_t prefixLength = length & unrollMask; - size_t i; -- size_t u; - if (length == 1) return 1; - /* Check if prefix is RLE first before using unrolled loop */ - if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { - return 0; - } - for (i = prefixLength; i != length; i += unrollSize) { -+ size_t u; - for (u = 0; u < unrollSize; u += sizeof(size_t)) { - if (MEM_readST(ip + i + u) != valueST) { - return 0; -- } -- } -- } -+ } } } - return 1; - } - -@@ -2938,7 +3449,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) - return nbSeqs < 4 && nbLits < 10; - } - --static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) -+static void -+ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) - { - ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; - bs->prevCBlock = bs->nextCBlock; -@@ -2946,7 +3458,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c - } - - /* Writes the block header */ --static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { -+static void -+writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) -+{ - U32 const cBlockHeader = cSize == 1 ? - lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : - lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); -@@ -2959,13 +3473,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB - * Stores literals block type (raw, rle, compressed, repeat) and - * huffman description table to hufMetadata. - * Requires ENTROPY_WORKSPACE_SIZE workspace -- * @return : size of huffman description table or error code */ --static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, -- const ZSTD_hufCTables_t* prevHuf, -- ZSTD_hufCTables_t* nextHuf, -- ZSTD_hufCTablesMetadata_t* hufMetadata, -- const int literalsCompressionIsDisabled, -- void* workspace, size_t wkspSize) -+ * @return : size of huffman description table, or an error code -+ */ -+static size_t -+ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, -+ const ZSTD_hufCTables_t* prevHuf, -+ ZSTD_hufCTables_t* nextHuf, -+ ZSTD_hufCTablesMetadata_t* hufMetadata, -+ const int literalsCompressionIsDisabled, -+ void* workspace, size_t wkspSize, -+ int hufFlags) - { - BYTE* const wkspStart = (BYTE*)workspace; - BYTE* const wkspEnd = wkspStart + wkspSize; -@@ -2973,9 +3490,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi - unsigned* const countWksp = (unsigned*)workspace; - const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); - BYTE* const nodeWksp = countWkspStart + countWkspSize; -- const size_t nodeWkspSize = wkspEnd-nodeWksp; -+ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); - unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; -- unsigned huffLog = HUF_TABLELOG_DEFAULT; -+ unsigned huffLog = LitHufLog; - HUF_repeat repeat = prevHuf->repeatMode; - DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); - -@@ -2990,73 +3507,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi - - /* small ? 
don't even attempt compression (speed opt) */ - #ifndef COMPRESS_LITERALS_SIZE_MIN --#define COMPRESS_LITERALS_SIZE_MIN 63 -+# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ - #endif - { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; - if (srcSize <= minLitSize) { - DEBUGLOG(5, "set_basic - too small"); - hufMetadata->hType = set_basic; - return 0; -- } -- } -+ } } - - /* Scan input and build symbol stats */ -- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); -+ { size_t const largest = -+ HIST_count_wksp (countWksp, &maxSymbolValue, -+ (const BYTE*)src, srcSize, -+ workspace, wkspSize); - FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); - if (largest == srcSize) { -+ /* only one literal symbol */ - DEBUGLOG(5, "set_rle"); - hufMetadata->hType = set_rle; - return 0; - } - if (largest <= (srcSize >> 7)+4) { -+ /* heuristic: likely not compressible */ - DEBUGLOG(5, "set_basic - no gain"); - hufMetadata->hType = set_basic; - return 0; -- } -- } -+ } } - - /* Validate the previous Huffman table */ -- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { -+ if (repeat == HUF_repeat_check -+ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { - repeat = HUF_repeat_none; - } - - /* Build Huffman Tree */ - ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); -- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); -+ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); -+ assert(huffLog <= LitHufLog); - { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, - maxSymbolValue, huffLog, - nodeWksp, nodeWkspSize); - FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); - huffLog = (U32)maxBits; -- { /* Build and write the CTable */ -- size_t const newCSize = HUF_estimateCompressedSize( -- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); -- size_t const hSize = HUF_writeCTable_wksp( -- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), -- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, -- nodeWksp, nodeWkspSize); -- /* Check against repeating the previous CTable */ -- if (repeat != HUF_repeat_none) { -- size_t const oldCSize = HUF_estimateCompressedSize( -- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); -- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { -- DEBUGLOG(5, "set_repeat - smaller"); -- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); -- hufMetadata->hType = set_repeat; -- return 0; -- } -- } -- if (newCSize + hSize >= srcSize) { -- DEBUGLOG(5, "set_basic - no gains"); -+ } -+ { /* Build and write the CTable */ -+ size_t const newCSize = HUF_estimateCompressedSize( -+ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); -+ size_t const hSize = HUF_writeCTable_wksp( -+ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), -+ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, -+ nodeWksp, nodeWkspSize); -+ /* Check against repeating the previous CTable */ -+ if (repeat != HUF_repeat_none) { -+ size_t const oldCSize = HUF_estimateCompressedSize( -+ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); -+ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { -+ DEBUGLOG(5, "set_repeat - smaller"); - ZSTD_memcpy(nextHuf, prevHuf, 
sizeof(*prevHuf)); -- hufMetadata->hType = set_basic; -+ hufMetadata->hType = set_repeat; - return 0; -- } -- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); -- hufMetadata->hType = set_compressed; -- nextHuf->repeatMode = HUF_repeat_check; -- return hSize; -+ } } -+ if (newCSize + hSize >= srcSize) { -+ DEBUGLOG(5, "set_basic - no gains"); -+ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); -+ hufMetadata->hType = set_basic; -+ return 0; - } -+ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); -+ hufMetadata->hType = set_compressed; -+ nextHuf->repeatMode = HUF_repeat_check; -+ return hSize; - } - } - -@@ -3066,8 +3587,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi - * and updates nextEntropy to the appropriate repeatMode. - */ - static ZSTD_symbolEncodingTypeStats_t --ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { -- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; -+ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) -+{ -+ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; - nextEntropy->litlength_repeatMode = FSE_repeat_none; - nextEntropy->offcode_repeatMode = FSE_repeat_none; - nextEntropy->matchlength_repeatMode = FSE_repeat_none; -@@ -3078,16 +3600,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { - * Builds entropy for the sequences. - * Stores symbol compression modes and fse table to fseMetadata. - * Requires ENTROPY_WORKSPACE_SIZE wksp. -- * @return : size of fse tables or error code */ --static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, -- const ZSTD_fseCTables_t* prevEntropy, -- ZSTD_fseCTables_t* nextEntropy, -- const ZSTD_CCtx_params* cctxParams, -- ZSTD_fseCTablesMetadata_t* fseMetadata, -- void* workspace, size_t wkspSize) -+ * @return : size of fse tables or error code */ -+static size_t -+ZSTD_buildBlockEntropyStats_sequences( -+ const seqStore_t* seqStorePtr, -+ const ZSTD_fseCTables_t* prevEntropy, -+ ZSTD_fseCTables_t* nextEntropy, -+ const ZSTD_CCtx_params* cctxParams, -+ ZSTD_fseCTablesMetadata_t* fseMetadata, -+ void* workspace, size_t wkspSize) - { - ZSTD_strategy const strategy = cctxParams->cParams.strategy; -- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; -+ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); - BYTE* const ostart = fseMetadata->fseTablesBuffer; - BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); - BYTE* op = ostart; -@@ -3114,23 +3638,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, - /* ZSTD_buildBlockEntropyStats() : - * Builds entropy for the block. 
- * Requires workspace size ENTROPY_WORKSPACE_SIZE -- * -- * @return : 0 on success or error code -+ * @return : 0 on success, or an error code -+ * Note : also employed in superblock - */ --size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, -- const ZSTD_entropyCTables_t* prevEntropy, -- ZSTD_entropyCTables_t* nextEntropy, -- const ZSTD_CCtx_params* cctxParams, -- ZSTD_entropyCTablesMetadata_t* entropyMetadata, -- void* workspace, size_t wkspSize) --{ -- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; -+size_t ZSTD_buildBlockEntropyStats( -+ const seqStore_t* seqStorePtr, -+ const ZSTD_entropyCTables_t* prevEntropy, -+ ZSTD_entropyCTables_t* nextEntropy, -+ const ZSTD_CCtx_params* cctxParams, -+ ZSTD_entropyCTablesMetadata_t* entropyMetadata, -+ void* workspace, size_t wkspSize) -+{ -+ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); -+ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); -+ int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0; -+ - entropyMetadata->hufMetadata.hufDesSize = - ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, - &prevEntropy->huf, &nextEntropy->huf, - &entropyMetadata->hufMetadata, - ZSTD_literalsCompressionIsDisabled(cctxParams), -- workspace, wkspSize); -+ workspace, wkspSize, hufFlags); -+ - FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); - entropyMetadata->fseMetadata.fseTablesSize = - ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, -@@ -3143,11 +3672,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, - } - - /* Returns the size estimate for the literals section (header + content) of a block */ --static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, -- const ZSTD_hufCTables_t* huf, -- const ZSTD_hufCTablesMetadata_t* hufMetadata, -- void* workspace, size_t wkspSize, -- int writeEntropy) -+static size_t -+ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, -+ const ZSTD_hufCTables_t* huf, -+ const ZSTD_hufCTablesMetadata_t* hufMetadata, -+ void* workspace, size_t wkspSize, -+ int writeEntropy) - { - unsigned* const countWksp = (unsigned*)workspace; - unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; -@@ -3169,12 +3699,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz - } - - /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ --static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, -- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, -- const FSE_CTable* fseCTable, -- const U8* additionalBits, -- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, -- void* workspace, size_t wkspSize) -+static size_t -+ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, -+ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, -+ const FSE_CTable* fseCTable, -+ const U8* additionalBits, -+ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, -+ void* workspace, size_t wkspSize) - { - unsigned* const countWksp = (unsigned*)workspace; - const BYTE* ctp = codeTable; -@@ -3206,99 +3737,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, - } - - /* Returns the size estimate for the sequences section (header + content) of a block */ --static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, -- const BYTE* llCodeTable, -- const BYTE* mlCodeTable, -- size_t nbSeq, -- const 
ZSTD_fseCTables_t* fseTables, -- const ZSTD_fseCTablesMetadata_t* fseMetadata, -- void* workspace, size_t wkspSize, -- int writeEntropy) -+static size_t -+ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, -+ const BYTE* llCodeTable, -+ const BYTE* mlCodeTable, -+ size_t nbSeq, -+ const ZSTD_fseCTables_t* fseTables, -+ const ZSTD_fseCTablesMetadata_t* fseMetadata, -+ void* workspace, size_t wkspSize, -+ int writeEntropy) - { - size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); - size_t cSeqSizeEstimate = 0; - cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, -- fseTables->offcodeCTable, NULL, -- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, -- workspace, wkspSize); -+ fseTables->offcodeCTable, NULL, -+ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, -+ workspace, wkspSize); - cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, -- fseTables->litlengthCTable, LL_bits, -- LL_defaultNorm, LL_defaultNormLog, MaxLL, -- workspace, wkspSize); -+ fseTables->litlengthCTable, LL_bits, -+ LL_defaultNorm, LL_defaultNormLog, MaxLL, -+ workspace, wkspSize); - cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, -- fseTables->matchlengthCTable, ML_bits, -- ML_defaultNorm, ML_defaultNormLog, MaxML, -- workspace, wkspSize); -+ fseTables->matchlengthCTable, ML_bits, -+ ML_defaultNorm, ML_defaultNormLog, MaxML, -+ workspace, wkspSize); - if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; - return cSeqSizeEstimate + sequencesSectionHeaderSize; - } - - /* Returns the size estimate for a given stream of literals, of, ll, ml */ --static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, -- const BYTE* ofCodeTable, -- const BYTE* llCodeTable, -- const BYTE* mlCodeTable, -- size_t nbSeq, -- const ZSTD_entropyCTables_t* entropy, -- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, -- void* workspace, size_t wkspSize, -- int writeLitEntropy, int writeSeqEntropy) { -+static size_t -+ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, -+ const BYTE* ofCodeTable, -+ const BYTE* llCodeTable, -+ const BYTE* mlCodeTable, -+ size_t nbSeq, -+ const ZSTD_entropyCTables_t* entropy, -+ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, -+ void* workspace, size_t wkspSize, -+ int writeLitEntropy, int writeSeqEntropy) -+{ - size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, -- &entropy->huf, &entropyMetadata->hufMetadata, -- workspace, wkspSize, writeLitEntropy); -+ &entropy->huf, &entropyMetadata->hufMetadata, -+ workspace, wkspSize, writeLitEntropy); - size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, -- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, -- workspace, wkspSize, writeSeqEntropy); -+ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, -+ workspace, wkspSize, writeSeqEntropy); - return seqSize + literalsSize + ZSTD_blockHeaderSize; - } - - /* Builds entropy statistics and uses them for blocksize estimation. - * -- * Returns the estimated compressed size of the seqStore, or a zstd error. -+ * @return: estimated compressed size of the seqStore, or a zstd error. 
- */ --static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { -- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; -+static size_t -+ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) -+{ -+ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; - DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); - FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, - &zc->blockState.prevCBlock->entropy, - &zc->blockState.nextCBlock->entropy, - &zc->appliedParams, - entropyMetadata, -- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); -- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), -+ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), ""); -+ return ZSTD_estimateBlockSize( -+ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), - seqStore->ofCode, seqStore->llCode, seqStore->mlCode, - (size_t)(seqStore->sequences - seqStore->sequencesStart), -- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, -+ &zc->blockState.nextCBlock->entropy, -+ entropyMetadata, -+ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, - (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); - } - - /* Returns literals bytes represented in a seqStore */ --static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { -+static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) -+{ - size_t literalsBytes = 0; -- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; -+ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); - size_t i; - for (i = 0; i < nbSeqs; ++i) { -- seqDef seq = seqStore->sequencesStart[i]; -+ seqDef const seq = seqStore->sequencesStart[i]; - literalsBytes += seq.litLength; - if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { - literalsBytes += 0x10000; -- } -- } -+ } } - return literalsBytes; - } - - /* Returns match bytes represented in a seqStore */ --static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { -+static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) -+{ - size_t matchBytes = 0; -- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; -+ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); - size_t i; - for (i = 0; i < nbSeqs; ++i) { - seqDef seq = seqStore->sequencesStart[i]; - matchBytes += seq.mlBase + MINMATCH; - if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { - matchBytes += 0x10000; -- } -- } -+ } } - return matchBytes; - } - -@@ -3307,15 +3846,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { - */ - static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, - const seqStore_t* originalSeqStore, -- size_t startIdx, size_t endIdx) { -- BYTE* const litEnd = originalSeqStore->lit; -- size_t literalsBytes; -- size_t literalsBytesPreceding = 0; -- -+ size_t startIdx, size_t endIdx) -+{ - *resultSeqStore = *originalSeqStore; - if (startIdx > 0) { - resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; -- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); -+ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); - } - - /* Move 
longLengthPos into the correct position if necessary */ -@@ -3328,13 +3864,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, - } - resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; - resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; -- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); -- resultSeqStore->litStart += literalsBytesPreceding; - if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { - /* This accounts for possible last literals if the derived chunk reaches the end of the block */ -- resultSeqStore->lit = litEnd; -+ assert(resultSeqStore->lit == originalSeqStore->lit); - } else { -- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; -+ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); -+ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; - } - resultSeqStore->llCode += startIdx; - resultSeqStore->mlCode += startIdx; -@@ -3342,20 +3877,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, - } - - /* -- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. -- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). -+ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. -+ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). - */ - static U32 --ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) --{ -- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ -- assert(STORED_IS_REPCODE(offCode)); -- if (adjustedOffCode == ZSTD_REP_NUM) { -- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ -- assert(rep[0] > 0); -+ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) -+{ -+ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ -+ assert(OFFBASE_IS_REPCODE(offBase)); -+ if (adjustedRepCode == ZSTD_REP_NUM) { -+ assert(ll0); -+ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 -+ * This is only valid if it results in a valid offset value, aka > 0. -+ * Note : it may happen that `rep[0]==1` in exceptional circumstances. -+ * In which case this function will return 0, which is an invalid offset. -+ * It's not an issue though, since this value will be -+ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). -+ */ - return rep[0] - 1; - } -- return rep[adjustedOffCode]; -+ return rep[adjustedRepCode]; - } - - /* -@@ -3371,30 +3912,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c - * 1-3 : repcode 1-3 - * 4+ : real_offset+3 - */ --static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, -- seqStore_t* const seqStore, U32 const nbSeq) { -+static void -+ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, -+ const seqStore_t* const seqStore, U32 const nbSeq) -+{ - U32 idx = 0; -+ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? 
seqStore->longLengthPos : nbSeq; - for (; idx < nbSeq; ++idx) { - seqDef* const seq = seqStore->sequencesStart + idx; -- U32 const ll0 = (seq->litLength == 0); -- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); -- assert(seq->offBase > 0); -- if (STORED_IS_REPCODE(offCode)) { -- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); -- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); -+ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); -+ U32 const offBase = seq->offBase; -+ assert(offBase > 0); -+ if (OFFBASE_IS_REPCODE(offBase)) { -+ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); -+ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); - /* Adjust simulated decompression repcode history if we come across a mismatch. Replace - * the repcode with the offset it actually references, determined by the compression - * repcode history. - */ - if (dRawOffset != cRawOffset) { -- seq->offBase = cRawOffset + ZSTD_REP_NUM; -+ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); - } - } - /* Compression repcode history is always updated with values directly from the unmodified seqStore. - * Decompression repcode history may use modified seq->offset value taken from compression repcode history. - */ -- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); -- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); -+ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); -+ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); - } - } - -@@ -3404,10 +3948,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ - * Returns the total size of that block (including header) or a ZSTD error code. - */ - static size_t --ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, -+ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, -+ const seqStore_t* const seqStore, - repcodes_t* const dRep, repcodes_t* const cRep, - void* dst, size_t dstCapacity, -- const void* src, size_t srcSize, -+ const void* src, size_t srcSize, - U32 lastBlock, U32 isPartition) - { - const U32 rleMaxLength = 25; -@@ -3442,8 +3987,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, - cSeqsSize = 1; - } - -+ /* Sequence collection not supported when block splitting */ - if (zc->seqCollector.collectSequences) { -- ZSTD_copyBlockSequences(zc); -+ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); - ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); - return 0; - } -@@ -3481,45 +4027,49 @@ typedef struct { - - /* Helper function to perform the recursive search for block splits. - * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. -- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then -- * we do not recurse. -+ * If advantageous to split, then we recurse down the two sub-blocks. -+ * If not, or if an error occurred in estimation, then we do not recurse. - * -- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. -+ * Note: The recursion depth is capped by a heuristic minimum number of sequences, -+ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. 
- * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). - * In practice, recursion depth usually doesn't go beyond 4. - * -- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize -+ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. -+ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize - * maximum of 128 KB, this value is actually impossible to reach. - */ - static void - ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, - ZSTD_CCtx* zc, const seqStore_t* origSeqStore) - { -- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; -- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; -- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; -+ seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; -+ seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; -+ seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; - size_t estimatedOriginalSize; - size_t estimatedFirstHalfSize; - size_t estimatedSecondHalfSize; - size_t midIdx = (startIdx + endIdx)/2; - -+ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); -+ assert(endIdx >= startIdx); - if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { -- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); -+ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); - return; - } -- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); - ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); - ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); - ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); - estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); - estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); - estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); -- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", -+ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", - estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); - if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { - return; - } - if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { -+ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); - ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); - splits->splitLocations[splits->idx] = (U32)midIdx; - splits->idx++; -@@ -3527,14 +4077,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end - } - } - --/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. -+/* Base recursive function. -+ * Populates a table with intra-block partition indices that can improve compression ratio. - * -- * Returns the number of splits made (which equals the size of the partition table - 1). 
-+ * @return: number of splits made (which equals the size of the partition table - 1). - */ --static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { -- seqStoreSplits splits = {partitions, 0}; -+static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) -+{ -+ seqStoreSplits splits; -+ splits.splitLocations = partitions; -+ splits.idx = 0; - if (nbSeq <= 4) { -- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); -+ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); - /* Refuse to try and split anything with less than 4 sequences */ - return 0; - } -@@ -3550,18 +4104,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) - * Returns combined size of all blocks (which includes headers), or a ZSTD error code. - */ - static size_t --ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, -- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) -+ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, -+ void* dst, size_t dstCapacity, -+ const void* src, size_t blockSize, -+ U32 lastBlock, U32 nbSeq) - { - size_t cSize = 0; - const BYTE* ip = (const BYTE*)src; - BYTE* op = (BYTE*)dst; - size_t i = 0; - size_t srcBytesTotal = 0; -- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ -- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; -- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; -- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); -+ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ -+ seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; -+ seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; -+ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); - - /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history - * may become invalid. 
In order to reconcile potentially invalid repcodes, we keep track of two -@@ -3583,30 +4139,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac - ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); - ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); - -- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", -+ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", - (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, - (unsigned)zc->blockState.matchState.nextToUpdate); - - if (numSplits == 0) { -- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, -- &dRep, &cRep, -- op, dstCapacity, -- ip, blockSize, -- lastBlock, 0 /* isPartition */); -+ size_t cSizeSingleBlock = -+ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, -+ &dRep, &cRep, -+ op, dstCapacity, -+ ip, blockSize, -+ lastBlock, 0 /* isPartition */); - FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); - DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); -- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); -+ assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX); -+ assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize); - return cSizeSingleBlock; - } - - ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); - for (i = 0; i <= numSplits; ++i) { -- size_t srcBytes; - size_t cSizeChunk; - U32 const lastPartition = (i == numSplits); - U32 lastBlockEntireSrc = 0; - -- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); -+ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); - srcBytesTotal += srcBytes; - if (lastPartition) { - /* This is the final partition, need to account for possible last literals */ -@@ -3621,7 +4178,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac - op, dstCapacity, - ip, srcBytes, - lastBlockEntireSrc, 1 /* isPartition */); -- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); -+ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", -+ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); - FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); - - ip += srcBytes; -@@ -3629,10 +4187,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac - dstCapacity -= cSizeChunk; - cSize += cSizeChunk; - *currSeqStore = *nextSeqStore; -- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); -+ assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize); - } -- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes -- * for the next block. -+ /* cRep and dRep may have diverged during the compression. -+ * If so, we use the dRep repcodes for the next block. 
- */ - ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); - return cSize; -@@ -3643,8 +4201,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, U32 lastBlock) - { -- const BYTE* ip = (const BYTE*)src; -- BYTE* op = (BYTE*)dst; - U32 nbSeq; - size_t cSize; - DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); -@@ -3655,7 +4211,8 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, - if (bss == ZSTDbss_noCompress) { - if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) - zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; -- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); -+ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); -+ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); - FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); - DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); - return cSize; -@@ -3673,9 +4230,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, U32 frame) - { -- /* This the upper bound for the length of an rle block. -- * This isn't the actual upper bound. Finding the real threshold -- * needs further investigation. -+ /* This is an estimated upper bound for the length of an rle block. -+ * This isn't the actual upper bound. -+ * Finding the real threshold needs further investigation. - */ - const U32 rleMaxLength = 25; - size_t cSize; -@@ -3687,11 +4244,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, - - { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); - FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); -- if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } -+ if (bss == ZSTDbss_noCompress) { -+ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); -+ cSize = 0; -+ goto out; -+ } - } - - if (zc->seqCollector.collectSequences) { -- ZSTD_copyBlockSequences(zc); -+ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); - ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); - return 0; - } -@@ -3767,10 +4328,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, - * * cSize >= blockBound(srcSize): We have expanded the block too much so - * emit an uncompressed block. - */ -- { -- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); -+ { size_t const cSize = -+ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); - if (cSize != ERROR(dstSize_tooSmall)) { -- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); -+ size_t const maxCSize = -+ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); - FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); - if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { - ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); -@@ -3778,7 +4340,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, - } - } - } -- } -+ } /* if (bss == ZSTDbss_compress)*/ - - DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); - /* Superblock compression failed, attempt to emit a single no compress block. 
-@@ -3836,7 +4398,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, - * All blocks will be terminated, all input will be consumed. - * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. - * Frame is supposed already started (header already produced) --* @return : compressed size, or an error code -+* @return : compressed size, or an error code - */ - static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, - void* dst, size_t dstCapacity, -@@ -3860,7 +4422,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, - ZSTD_matchState_t* const ms = &cctx->blockState.matchState; - U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); - -- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, -+ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding -+ * additional 1. We need to revisit and change this logic to be more consistent */ -+ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, - dstSize_tooSmall, - "not enough space to store compressed block"); - if (remaining < blockSize) blockSize = remaining; -@@ -3899,7 +4463,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, - MEM_writeLE24(op, cBlockHeader); - cSize += ZSTD_blockHeaderSize; - } -- } -+ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ - - - ip += blockSize; -@@ -4001,19 +4565,15 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity) - } - } - --size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) -+void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) - { -- RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, -- "wrong cctx stage"); -- RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable, -- parameter_unsupported, -- "incompatible with ldm"); -+ assert(cctx->stage == ZSTDcs_init); -+ assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); - cctx->externSeqStore.seq = seq; - cctx->externSeqStore.size = nbSeq; - cctx->externSeqStore.capacity = nbSeq; - cctx->externSeqStore.pos = 0; - cctx->externSeqStore.posInSequence = 0; -- return 0; - } - - -@@ -4078,31 +4638,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, - } - } - --size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, -- void* dst, size_t dstCapacity, -- const void* src, size_t srcSize) -+size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, -+ void* dst, size_t dstCapacity, -+ const void* src, size_t srcSize) - { - DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); - return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); - } - -+/* NOTE: Must just wrap ZSTD_compressContinue_public() */ -+size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, -+ void* dst, size_t dstCapacity, -+ const void* src, size_t srcSize) -+{ -+ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); -+} - --size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) -+static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) - { - ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; - assert(!ZSTD_checkCParams(cParams)); -- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); -+ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); - } - --size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) -+/* NOTE: Must just wrap 
ZSTD_getBlockSize_deprecated() */ -+size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) -+{ -+ return ZSTD_getBlockSize_deprecated(cctx); -+} -+ -+/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ -+size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) - { - DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); -- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); -+ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); - RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } - - return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); - } - -+/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ -+size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) -+{ -+ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); -+} -+ - /*! ZSTD_loadDictionaryContent() : - * @return : 0, or an error code - */ -@@ -4111,25 +4691,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, - ZSTD_cwksp* ws, - ZSTD_CCtx_params const* params, - const void* src, size_t srcSize, -- ZSTD_dictTableLoadMethod_e dtlm) -+ ZSTD_dictTableLoadMethod_e dtlm, -+ ZSTD_tableFillPurpose_e tfp) - { - const BYTE* ip = (const BYTE*) src; - const BYTE* const iend = ip + srcSize; - int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; - -- /* Assert that we the ms params match the params we're being given */ -+ /* Assert that the ms params match the params we're being given */ - ZSTD_assertEqualCParams(params->cParams, ms->cParams); - -- if (srcSize > ZSTD_CHUNKSIZE_MAX) { -+ { /* Ensure large dictionaries can't cause index overflow */ -+ - /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. - * Dictionaries right at the edge will immediately trigger overflow - * correction, but I don't want to insert extra constraints here. - */ -- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; -- /* We must have cleared our windows when our source is this large. */ -- assert(ZSTD_window_isEmpty(ms->window)); -- if (loadLdmDict) -- assert(ZSTD_window_isEmpty(ls->window)); -+ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; -+ -+ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); -+ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { -+ /* Some dictionary matchfinders in zstd use "short cache", -+ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each -+ * CDict hashtable entry as a tag rather than as part of an index. -+ * When short cache is used, we need to truncate the dictionary -+ * so that its indices don't overlap with the tag. */ -+ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; -+ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); -+ assert(!loadLdmDict); -+ } -+ - /* If the dictionary is too large, only load the suffix of the dictionary. */ - if (srcSize > maxDictSize) { - ip = iend - maxDictSize; -@@ -4138,35 +4729,58 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, - } - } - -- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); -+ if (srcSize > ZSTD_CHUNKSIZE_MAX) { -+ /* We must have cleared our windows when our source is this large. 
*/ -+ assert(ZSTD_window_isEmpty(ms->window)); -+ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); -+ } - ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); -- ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); -- ms->forceNonContiguous = params->deterministicRefPrefix; - -- if (loadLdmDict) { -+ DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); -+ -+ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ - ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); - ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); -+ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); - } - -+ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ -+ if (params->cParams.strategy < ZSTD_btultra) { -+ U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28); -+ if (srcSize > maxDictSize) { -+ ip = iend - maxDictSize; -+ src = ip; -+ srcSize = maxDictSize; -+ } -+ } -+ -+ ms->nextToUpdate = (U32)(ip - ms->window.base); -+ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); -+ ms->forceNonContiguous = params->deterministicRefPrefix; -+ - if (srcSize <= HASH_READ_SIZE) return 0; - - ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); - -- if (loadLdmDict) -- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); -- - switch(params->cParams.strategy) - { - case ZSTD_fast: -- ZSTD_fillHashTable(ms, iend, dtlm); -+ ZSTD_fillHashTable(ms, iend, dtlm, tfp); - break; - case ZSTD_dfast: -- ZSTD_fillDoubleHashTable(ms, iend, dtlm); -+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR -+ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); -+#else -+ assert(0); /* shouldn't be called: cparams should've been adjusted. */ -+#endif - break; - - case ZSTD_greedy: - case ZSTD_lazy: - case ZSTD_lazy2: -+#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) - assert(srcSize >= HASH_READ_SIZE); - if (ms->dedicatedDictSearch) { - assert(ms->chainTable != NULL); -@@ -4174,7 +4788,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, - } else { - assert(params->useRowMatchFinder != ZSTD_ps_auto); - if (params->useRowMatchFinder == ZSTD_ps_enable) { -- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); -+ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); - ZSTD_memset(ms->tagTable, 0, tagTableSize); - ZSTD_row_update(ms, iend-HASH_READ_SIZE); - DEBUGLOG(4, "Using row-based hash table for lazy dict"); -@@ -4183,14 +4797,23 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, - DEBUGLOG(4, "Using chain-based hash table for lazy dict"); - } - } -+#else -+ assert(0); /* shouldn't be called: cparams should've been adjusted. */ -+#endif - break; - - case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ - case ZSTD_btopt: - case ZSTD_btultra: - case ZSTD_btultra2: -+#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) - assert(srcSize >= HASH_READ_SIZE); - ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); -+#else -+ assert(0); /* shouldn't be called: cparams should've been adjusted. 
*/ -+#endif - break; - - default: -@@ -4237,11 +4860,10 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, - - /* We only set the loaded table as valid if it contains all non-zero - * weights. Otherwise, we set it to check */ -- if (!hasZeroWeights) -+ if (!hasZeroWeights && maxSymbolValue == 255) - bs->entropy.huf.repeatMode = HUF_repeat_valid; - - RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); -- RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); - dictPtr += hufHeaderSize; - } - -@@ -4327,6 +4949,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, - ZSTD_CCtx_params const* params, - const void* dict, size_t dictSize, - ZSTD_dictTableLoadMethod_e dtlm, -+ ZSTD_tableFillPurpose_e tfp, - void* workspace) - { - const BYTE* dictPtr = (const BYTE*)dict; -@@ -4345,7 +4968,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, - { - size_t const dictContentSize = (size_t)(dictEnd - dictPtr); - FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( -- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); -+ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); - } - return dictID; - } -@@ -4361,6 +4984,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, - const void* dict, size_t dictSize, - ZSTD_dictContentType_e dictContentType, - ZSTD_dictTableLoadMethod_e dtlm, -+ ZSTD_tableFillPurpose_e tfp, - void* workspace) - { - DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); -@@ -4373,13 +4997,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, - - /* dict restricted modes */ - if (dictContentType == ZSTD_dct_rawContent) -- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); -+ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); - - if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { - if (dictContentType == ZSTD_dct_auto) { - DEBUGLOG(4, "raw content dictionary detected"); - return ZSTD_loadDictionaryContent( -- ms, ls, ws, params, dict, dictSize, dtlm); -+ ms, ls, ws, params, dict, dictSize, dtlm, tfp); - } - RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); - assert(0); /* impossible */ -@@ -4387,13 +5011,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, - - /* dict as full zstd dictionary */ - return ZSTD_loadZstdDictionary( -- bs, ms, ws, params, dict, dictSize, dtlm, workspace); -+ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); - } - - #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) - #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) - - /*! 
ZSTD_compressBegin_internal() : -+ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both - * @return : 0, or an error code */ - static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, - const void* dict, size_t dictSize, -@@ -4426,11 +5051,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, - cctx->blockState.prevCBlock, &cctx->blockState.matchState, - &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, - cdict->dictContentSize, cdict->dictContentType, dtlm, -- cctx->entropyWorkspace) -+ ZSTD_tfp_forCCtx, cctx->entropyWorkspace) - : ZSTD_compress_insertDictionary( - cctx->blockState.prevCBlock, &cctx->blockState.matchState, - &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, -- dictContentType, dtlm, cctx->entropyWorkspace); -+ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace); - FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); - assert(dictID <= UINT_MAX); - cctx->dictID = (U32)dictID; -@@ -4471,11 +5096,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, - &cctxParams, pledgedSrcSize); - } - --size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) -+static size_t -+ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) - { - ZSTD_CCtx_params cctxParams; -- { -- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); -+ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); - ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); - } - DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); -@@ -4483,9 +5108,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di - &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); - } - -+size_t -+ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) -+{ -+ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); -+} -+ - size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) - { -- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); -+ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); - } - - -@@ -4496,14 +5127,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) - { - BYTE* const ostart = (BYTE*)dst; - BYTE* op = ostart; -- size_t fhSize = 0; - - DEBUGLOG(4, "ZSTD_writeEpilogue"); - RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); - - /* special case : empty frame */ - if (cctx->stage == ZSTDcs_init) { -- fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); -+ size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); - FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); - dstCapacity -= fhSize; - op += fhSize; -@@ -4513,8 +5143,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) - if (cctx->stage != ZSTDcs_ending) { - /* write one last empty block, make it the "last" block */ - U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; -- RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); -- MEM_writeLE32(op, 
cBlockHeader24); -+ ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); -+ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); -+ MEM_writeLE24(op, cBlockHeader24); - op += ZSTD_blockHeaderSize; - dstCapacity -= ZSTD_blockHeaderSize; - } -@@ -4537,9 +5168,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) - (void)extraCSize; - } - --size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, -- void* dst, size_t dstCapacity, -- const void* src, size_t srcSize) -+size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, -+ void* dst, size_t dstCapacity, -+ const void* src, size_t srcSize) - { - size_t endResult; - size_t const cSize = ZSTD_compressContinue_internal(cctx, -@@ -4563,6 +5194,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, - return cSize + endResult; - } - -+/* NOTE: Must just wrap ZSTD_compressEnd_public() */ -+size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, -+ void* dst, size_t dstCapacity, -+ const void* src, size_t srcSize) -+{ -+ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); -+} -+ - size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, -@@ -4591,7 +5230,7 @@ size_t ZSTD_compress_advanced_internal( - FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, - dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, - params, srcSize, ZSTDb_not_buffered) , ""); -- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); -+ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); - } - - size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, -@@ -4709,7 +5348,7 @@ static size_t ZSTD_initCDict_internal( - { size_t const dictID = ZSTD_compress_insertDictionary( - &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, - ¶ms, cdict->dictContent, cdict->dictContentSize, -- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); -+ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); - FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); - assert(dictID <= (size_t)(U32)-1); - cdict->dictID = (U32)dictID; -@@ -4813,7 +5452,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( - if (!cdict) - return NULL; - -- if (ZSTD_isError( ZSTD_initCDict_internal(cdict, -+ if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, - dict, dictSize, - dictLoadMethod, dictContentType, - cctxParams) )) { -@@ -4908,6 +5547,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( - params.cParams = cParams; - params.useRowMatchFinder = useRowMatchFinder; - cdict->useRowMatchFinder = useRowMatchFinder; -+ cdict->compressionLevel = ZSTD_NO_CLEVEL; - - if (ZSTD_isError( ZSTD_initCDict_internal(cdict, - dict, dictSize, -@@ -4987,12 +5627,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( - - /* ZSTD_compressBegin_usingCDict() : - * cdict must be != NULL */ --size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) -+size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) - { - ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; - return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); - } - -+size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) -+{ -+ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); -+} -+ - /*! ZSTD_compress_usingCDict_internal(): - * Implementation of various ZSTD_compress_usingCDict* functions. 
- */ -@@ -5002,7 +5647,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, - const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) - { - FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ -- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); -+ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); - } - - /*! ZSTD_compress_usingCDict_advanced(): -@@ -5199,30 +5844,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) - - static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) - { -- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; -- if (hintInSize==0) hintInSize = cctx->blockSize; -- return hintInSize; -+ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { -+ return cctx->blockSize - cctx->stableIn_notConsumed; -+ } -+ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); -+ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; -+ if (hintInSize==0) hintInSize = cctx->blockSize; -+ return hintInSize; -+ } - } - - /* ZSTD_compressStream_generic(): - * internal function for all *compressStream*() variants -- * non-static, because can be called from zstdmt_compress.c -- * @return : hint size for next input */ -+ * @return : hint size for next input to complete ongoing block */ - static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, - ZSTD_outBuffer* output, - ZSTD_inBuffer* input, - ZSTD_EndDirective const flushMode) - { -- const char* const istart = (const char*)input->src; -- const char* const iend = input->size != 0 ? istart + input->size : istart; -- const char* ip = input->pos != 0 ? istart + input->pos : istart; -- char* const ostart = (char*)output->dst; -- char* const oend = output->size != 0 ? ostart + output->size : ostart; -- char* op = output->pos != 0 ? ostart + output->pos : ostart; -+ const char* const istart = (assert(input != NULL), (const char*)input->src); -+ const char* const iend = (istart != NULL) ? istart + input->size : istart; -+ const char* ip = (istart != NULL) ? istart + input->pos : istart; -+ char* const ostart = (assert(output != NULL), (char*)output->dst); -+ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; -+ char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; - U32 someMoreWork = 1; - - /* check expectations */ -- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); -+ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); -+ assert(zcs != NULL); -+ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { -+ assert(input->pos >= zcs->stableIn_notConsumed); -+ input->pos -= zcs->stableIn_notConsumed; -+ if (ip) ip -= zcs->stableIn_notConsumed; -+ zcs->stableIn_notConsumed = 0; -+ } - if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { - assert(zcs->inBuff != NULL); - assert(zcs->inBuffSize > 0); -@@ -5231,8 +5887,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, - assert(zcs->outBuff != NULL); - assert(zcs->outBuffSize > 0); - } -- assert(output->pos <= output->size); -+ if (input->src == NULL) assert(input->size == 0); - assert(input->pos <= input->size); -+ if (output->dst == NULL) assert(output->size == 0); -+ assert(output->pos <= output->size); - assert((U32)flushMode <= (U32)ZSTD_e_end); - - while (someMoreWork) { -@@ -5247,7 +5905,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, - || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ - && (zcs->inBuffPos == 0) ) { - /* shortcut to compression pass directly into output buffer */ -- size_t const cSize = ZSTD_compressEnd(zcs, -+ size_t const cSize = ZSTD_compressEnd_public(zcs, - op, oend-op, ip, iend-ip); - DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); - FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); -@@ -5264,8 +5922,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, - zcs->inBuff + zcs->inBuffPos, toLoad, - ip, iend-ip); - zcs->inBuffPos += loaded; -- if (loaded != 0) -- ip += loaded; -+ if (ip) ip += loaded; - if ( (flushMode == ZSTD_e_continue) - && (zcs->inBuffPos < zcs->inBuffTarget) ) { - /* not enough input to fill full block : stop here */ -@@ -5276,6 +5933,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, - /* empty */ - someMoreWork = 0; break; - } -+ } else { -+ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); -+ if ( (flushMode == ZSTD_e_continue) -+ && ( (size_t)(iend - ip) < zcs->blockSize) ) { -+ /* can't compress a full block : stop here */ -+ zcs->stableIn_notConsumed = (size_t)(iend - ip); -+ ip = iend; /* pretend to have consumed input */ -+ someMoreWork = 0; break; -+ } -+ if ( (flushMode == ZSTD_e_flush) -+ && (ip == iend) ) { -+ /* empty */ -+ someMoreWork = 0; break; -+ } - } - /* compress current block (note : this stage cannot be stopped in the middle) */ - DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); -@@ -5283,9 +5954,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, - void* cDst; - size_t cSize; - size_t oSize = oend-op; -- size_t const iSize = inputBuffered -- ? zcs->inBuffPos - zcs->inToCompress -- : MIN((size_t)(iend - ip), zcs->blockSize); -+ size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress -+ : MIN((size_t)(iend - ip), zcs->blockSize); - if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) - cDst = op; /* compress into output buffer, to skip flush stage */ - else -@@ -5293,9 +5963,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, - if (inputBuffered) { - unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); - cSize = lastBlock ? 
-- ZSTD_compressEnd(zcs, cDst, oSize, -+ ZSTD_compressEnd_public(zcs, cDst, oSize, - zcs->inBuff + zcs->inToCompress, iSize) : -- ZSTD_compressContinue(zcs, cDst, oSize, -+ ZSTD_compressContinue_public(zcs, cDst, oSize, - zcs->inBuff + zcs->inToCompress, iSize); - FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); - zcs->frameEnded = lastBlock; -@@ -5308,19 +5978,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, - if (!lastBlock) - assert(zcs->inBuffTarget <= zcs->inBuffSize); - zcs->inToCompress = zcs->inBuffPos; -- } else { -- unsigned const lastBlock = (ip + iSize == iend); -- assert(flushMode == ZSTD_e_end /* Already validated */); -+ } else { /* !inputBuffered, hence ZSTD_bm_stable */ -+ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); - cSize = lastBlock ? -- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : -- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); -+ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : -+ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); - /* Consume the input prior to error checking to mirror buffered mode. */ -- if (iSize > 0) -- ip += iSize; -+ if (ip) ip += iSize; - FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); - zcs->frameEnded = lastBlock; -- if (lastBlock) -- assert(ip == iend); -+ if (lastBlock) assert(ip == iend); - } - if (cDst == op) { /* no need to flush */ - op += cSize; -@@ -5390,8 +6057,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf - /* After a compression call set the expected input/output buffer. - * This is validated at the start of the next compression call. - */ --static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) -+static void -+ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) - { -+ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); - if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { - cctx->expectedInBuffer = *input; - } -@@ -5410,22 +6079,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, - { - if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { - ZSTD_inBuffer const expect = cctx->expectedInBuffer; -- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) -- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); -- if (endOp != ZSTD_e_end) -- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); -+ if (expect.src != input->src || expect.pos != input->pos) -+ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); - } -+ (void)endOp; - if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { - size_t const outBufferSize = output->size - output->pos; - if (cctx->expectedOutBufferSize != outBufferSize) -- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); -+ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); - } - return 0; - } - - static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, - ZSTD_EndDirective endOp, -- size_t inSize) { -+ size_t inSize) -+{ - ZSTD_CCtx_params params = cctx->requestedParams; - ZSTD_prefixDict const prefixDict = cctx->prefixDict; - FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the 
local dict if present. */ -@@ -5439,9 +6108,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, - params.compressionLevel = cctx->cdict->compressionLevel; - } - DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); -- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ -- { -- size_t const dictSize = prefixDict.dict -+ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ -+ -+ { size_t const dictSize = prefixDict.dict - ? prefixDict.dictSize - : (cctx->cdict ? cctx->cdict->dictContentSize : 0); - ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); -@@ -5453,6 +6122,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, - params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); - params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); - params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); -+ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); -+ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); -+ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); - - { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; - assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); -@@ -5479,6 +6151,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, - return 0; - } - -+/* @return provides a minimum amount of data remaining to be flushed from internal buffers -+ */ - size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, - ZSTD_outBuffer* output, - ZSTD_inBuffer* input, -@@ -5493,8 +6167,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, - - /* transparent initialization stage */ - if (cctx->streamStage == zcss_init) { -- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); -- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ -+ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ -+ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; -+ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ -+ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ -+ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ -+ if (cctx->stableIn_notConsumed) { /* not the first time */ -+ /* check stable source guarantees */ -+ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); -+ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); -+ } -+ /* pretend input was consumed, to give a sense forward progress */ -+ input->pos = input->size; -+ /* save stable inBuffer, for later control, and flush/end */ -+ cctx->expectedInBuffer = *input; -+ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ -+ cctx->stableIn_notConsumed += inputSize; -+ /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ -+ return 
ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ -+ } -+ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); -+ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ - } - /* end of transparent initialization stage */ - -@@ -5512,13 +6205,20 @@ size_t ZSTD_compressStream2_simpleArgs ( - const void* src, size_t srcSize, size_t* srcPos, - ZSTD_EndDirective endOp) - { -- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; -- ZSTD_inBuffer input = { src, srcSize, *srcPos }; -+ ZSTD_outBuffer output; -+ ZSTD_inBuffer input; -+ output.dst = dst; -+ output.size = dstCapacity; -+ output.pos = *dstPos; -+ input.src = src; -+ input.size = srcSize; -+ input.pos = *srcPos; - /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ -- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); -- *dstPos = output.pos; -- *srcPos = input.pos; -- return cErr; -+ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); -+ *dstPos = output.pos; -+ *srcPos = input.pos; -+ return cErr; -+ } - } - - size_t ZSTD_compress2(ZSTD_CCtx* cctx, -@@ -5541,6 +6241,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, - /* Reset to the original values. */ - cctx->requestedParams.inBufferMode = originalInBufferMode; - cctx->requestedParams.outBufferMode = originalOutBufferMode; -+ - FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); - if (result != 0) { /* compression not completed, due to lack of output space */ - assert(oPos == dstCapacity); -@@ -5551,64 +6252,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, - } - } - --typedef struct { -- U32 idx; /* Index in array of ZSTD_Sequence */ -- U32 posInSequence; /* Position within sequence at idx */ -- size_t posInSrc; /* Number of bytes given by sequences provided so far */ --} ZSTD_sequencePosition; -- - /* ZSTD_validateSequence() : - * @offCode : is presumed to follow format required by ZSTD_storeSeq() - * @returns a ZSTD error code if sequence is not valid - */ - static size_t --ZSTD_validateSequence(U32 offCode, U32 matchLength, -- size_t posInSrc, U32 windowLog, size_t dictSize) -+ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch, -+ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) - { -- U32 const windowSize = 1 << windowLog; -+ U32 const windowSize = 1u << windowLog; - /* posInSrc represents the amount of data the decoder would decode up to this point. - * As long as the amount of data decoded is less than or equal to window size, offsets may be - * larger than the total length of output decoded in order to reference the dict, even larger than - * window size. After output surpasses windowSize, we're limited to windowSize offsets again. - */ - size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; -- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); -- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); -+ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; -+ RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); -+ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ -+ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); - return 0; - } - - /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ --static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) -+static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) - { -- U32 offCode = STORE_OFFSET(rawOffset); -+ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); - - if (!ll0 && rawOffset == rep[0]) { -- offCode = STORE_REPCODE_1; -+ offBase = REPCODE1_TO_OFFBASE; - } else if (rawOffset == rep[1]) { -- offCode = STORE_REPCODE(2 - ll0); -+ offBase = REPCODE_TO_OFFBASE(2 - ll0); - } else if (rawOffset == rep[2]) { -- offCode = STORE_REPCODE(3 - ll0); -+ offBase = REPCODE_TO_OFFBASE(3 - ll0); - } else if (ll0 && rawOffset == rep[0] - 1) { -- offCode = STORE_REPCODE_3; -+ offBase = REPCODE3_TO_OFFBASE; - } -- return offCode; -+ return offBase; - } - --/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of -- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. -- */ --static size_t -+size_t - ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, - ZSTD_sequencePosition* seqPos, - const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, -- const void* src, size_t blockSize) -+ const void* src, size_t blockSize, -+ ZSTD_paramSwitch_e externalRepSearch) - { - U32 idx = seqPos->idx; -+ U32 const startIdx = idx; - BYTE const* ip = (BYTE const*)(src); - const BYTE* const iend = ip + blockSize; - repcodes_t updatedRepcodes; - U32 dictSize; - -+ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize); -+ - if (cctx->cdict) { - dictSize = (U32)cctx->cdict->dictContentSize; - } else if (cctx->prefixDict.dict) { -@@ -5617,25 +6315,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, - dictSize = 0; - } - ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); -- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { -+ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { - U32 const litLength = inSeqs[idx].litLength; -- U32 const ll0 = (litLength == 0); - U32 const matchLength = inSeqs[idx].matchLength; -- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); -- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); -+ U32 offBase; -+ -+ if (externalRepSearch == ZSTD_ps_disable) { -+ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); -+ } else { -+ U32 const ll0 = (litLength == 0); -+ offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); -+ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); -+ } - -- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); -+ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); - if (cctx->appliedParams.validateSequences) { - seqPos->posInSrc += litLength + matchLength; -- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, -- cctx->appliedParams.cParams.windowLog, dictSize), -+ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, 
matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, -+ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), - "Sequence validation failed"); - } -- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, -+ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, - "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); -- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); -+ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); - ip += matchLength + litLength; - } -+ -+ /* If we skipped repcode search while parsing, we need to update repcodes now */ -+ assert(externalRepSearch != ZSTD_ps_auto); -+ assert(idx >= startIdx); -+ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { -+ U32* const rep = updatedRepcodes.rep; -+ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ -+ -+ if (lastSeqIdx >= startIdx + 2) { -+ rep[2] = inSeqs[lastSeqIdx - 2].offset; -+ rep[1] = inSeqs[lastSeqIdx - 1].offset; -+ rep[0] = inSeqs[lastSeqIdx].offset; -+ } else if (lastSeqIdx == startIdx + 1) { -+ rep[2] = rep[0]; -+ rep[1] = inSeqs[lastSeqIdx - 1].offset; -+ rep[0] = inSeqs[lastSeqIdx].offset; -+ } else { -+ assert(lastSeqIdx == startIdx); -+ rep[2] = rep[1]; -+ rep[1] = rep[0]; -+ rep[0] = inSeqs[lastSeqIdx].offset; -+ } -+ } -+ - ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); - - if (inSeqs[idx].litLength) { -@@ -5644,26 +6372,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, - ip += inSeqs[idx].litLength; - seqPos->posInSrc += inSeqs[idx].litLength; - } -- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); -+ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); - seqPos->idx = idx+1; - return 0; - } - --/* Returns the number of bytes to move the current read position back by. Only non-zero -- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something -- * went wrong. -- * -- * This function will attempt to scan through blockSize bytes represented by the sequences -- * in inSeqs, storing any (partial) sequences. -- * -- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to -- * avoid splitting a match, or to avoid splitting a match such that it would produce a match -- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. 
-- */ --static size_t -+size_t - ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, - const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, -- const void* src, size_t blockSize) -+ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch) - { - U32 idx = seqPos->idx; - U32 startPosInSequence = seqPos->posInSequence; -@@ -5675,6 +6392,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* - U32 bytesAdjustment = 0; - U32 finalMatchSplit = 0; - -+ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ -+ (void)externalRepSearch; -+ - if (cctx->cdict) { - dictSize = cctx->cdict->dictContentSize; - } else if (cctx->prefixDict.dict) { -@@ -5682,7 +6402,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* - } else { - dictSize = 0; - } -- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); -+ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); - DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); - ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); - while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { -@@ -5690,7 +6410,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* - U32 litLength = currSeq.litLength; - U32 matchLength = currSeq.matchLength; - U32 const rawOffset = currSeq.offset; -- U32 offCode; -+ U32 offBase; - - /* Modify the sequence depending on where endPosInSequence lies */ - if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { -@@ -5704,7 +6424,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* - /* Move to the next sequence */ - endPosInSequence -= currSeq.litLength + currSeq.matchLength; - startPosInSequence = 0; -- idx++; - } else { - /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence - does not reach the end of the match. 
So, we have to split the sequence */ -@@ -5744,21 +6463,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* - } - /* Check if this offset can be represented with a repcode */ - { U32 const ll0 = (litLength == 0); -- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); -- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); -+ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); -+ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); - } - - if (cctx->appliedParams.validateSequences) { - seqPos->posInSrc += litLength + matchLength; -- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, -- cctx->appliedParams.cParams.windowLog, dictSize), -+ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, -+ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), - "Sequence validation failed"); - } -- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); -- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, -+ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); -+ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, - "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); -- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); -+ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); - ip += matchLength + litLength; -+ if (!finalMatchSplit) -+ idx++; /* Next Sequence */ - } - DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); - assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); -@@ -5781,7 +6502,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* - - typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, - const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, -- const void* src, size_t blockSize); -+ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); - static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) - { - ZSTD_sequenceCopier sequenceCopier = NULL; -@@ -5795,6 +6516,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) - return sequenceCopier; - } - -+/* Discover the size of next block by searching for the delimiter. -+ * Note that a block delimiter **must** exist in this mode, -+ * otherwise it's an input error. 
-+ * The block size retrieved will be later compared to ensure it remains within bounds */ -+static size_t -+blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) -+{ -+ int end = 0; -+ size_t blockSize = 0; -+ size_t spos = seqPos.idx; -+ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); -+ assert(spos <= inSeqsSize); -+ while (spos < inSeqsSize) { -+ end = (inSeqs[spos].offset == 0); -+ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; -+ if (end) { -+ if (inSeqs[spos].matchLength != 0) -+ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); -+ break; -+ } -+ spos++; -+ } -+ if (!end) -+ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); -+ return blockSize; -+} -+ -+/* More a "target" block size */ -+static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) -+{ -+ int const lastBlock = (remaining <= blockSize); -+ return lastBlock ? remaining : blockSize; -+} -+ -+static size_t determine_blockSize(ZSTD_sequenceFormat_e mode, -+ size_t blockSize, size_t remaining, -+ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) -+{ -+ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); -+ if (mode == ZSTD_sf_noBlockDelimiters) -+ return blockSize_noDelimiter(blockSize, remaining); -+ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); -+ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); -+ if (explicitBlockSize > blockSize) -+ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); -+ if (explicitBlockSize > remaining) -+ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); -+ return explicitBlockSize; -+ } -+} -+ - /* Compress, block-by-block, all of the sequences given. - * - * Returns the cumulative size of all compressed blocks (including their headers), -@@ -5807,9 +6579,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, - const void* src, size_t srcSize) - { - size_t cSize = 0; -- U32 lastBlock; -- size_t blockSize; -- size_t compressedSeqsSize; - size_t remaining = srcSize; - ZSTD_sequencePosition seqPos = {0, 0, 0}; - -@@ -5829,22 +6598,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, - } - - while (remaining) { -+ size_t compressedSeqsSize; - size_t cBlockSize; - size_t additionalByteAdjustment; -- lastBlock = remaining <= cctx->blockSize; -- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; -+ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, -+ cctx->blockSize, remaining, -+ inSeqs, inSeqsSize, seqPos); -+ U32 const lastBlock = (blockSize == remaining); -+ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); -+ assert(blockSize <= remaining); - ZSTD_resetSeqStore(&cctx->seqStore); -- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); -+ DEBUGLOG(5, "Working on new block. 
Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize); - -- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); -+ additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes); - FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); - blockSize -= additionalByteAdjustment; - - /* If blocks are too small, emit as a nocompress block */ -- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { -+ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding -+ * additional 1. We need to revisit and change this logic to be more consistent */ -+ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { - cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); - FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); -- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); -+ DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); - cSize += cBlockSize; - ip += blockSize; - op += cBlockSize; -@@ -5853,6 +6629,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, - continue; - } - -+ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); - compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, - &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, - &cctx->appliedParams, -@@ -5861,11 +6638,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, - cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, - cctx->bmi2); - FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); -- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); -+ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); - - if (!cctx->isFirstBlock && - ZSTD_maybeRLE(&cctx->seqStore) && -- ZSTD_isRLE((BYTE const*)src, srcSize)) { -+ ZSTD_isRLE(ip, blockSize)) { - /* We don't want to emit our first block as a RLE even if it qualifies because - * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
- * This is only an issue for zstd <= v1.4.3 -@@ -5876,12 +6653,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, - if (compressedSeqsSize == 0) { - /* ZSTD_noCompressBlock writes the block header as well */ - cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); -- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); -- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); -+ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); -+ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); - } else if (compressedSeqsSize == 1) { - cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); -- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); -- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); -+ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); -+ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); - } else { - U32 cBlockHeader; - /* Error checking and repcodes update */ -@@ -5893,11 +6670,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, - cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); - MEM_writeLE24(op, cBlockHeader); - cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; -- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); -+ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); - } - - cSize += cBlockSize; -- DEBUGLOG(4, "cSize running total: %zu", cSize); - - if (lastBlock) { - break; -@@ -5908,12 +6684,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, - dstCapacity -= cBlockSize; - cctx->isFirstBlock = 0; - } -+ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); - } - -+ DEBUGLOG(4, "cSize final total: %zu", cSize); - return cSize; - } - --size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, -+size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, -+ void* dst, size_t dstCapacity, - const ZSTD_Sequence* inSeqs, size_t inSeqsSize, - const void* src, size_t srcSize) - { -@@ -5923,7 +6702,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci - size_t frameHeaderSize = 0; - - /* Transparent initialization stage, same as compressStream2() */ -- DEBUGLOG(3, "ZSTD_compressSequences()"); -+ DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity); - assert(cctx != NULL); - FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); - /* Begin writing output, starting with frame header */ -@@ -5951,26 +6730,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci - cSize += 4; - } - -- DEBUGLOG(3, "Final compressed size: %zu", cSize); -+ DEBUGLOG(4, "Final compressed size: %zu", cSize); - return cSize; - } - - /*====== Finalize ======*/ - -+static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) -+{ -+ const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; -+ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); -+ return stableInput ? zcs->expectedInBuffer : nullInput; -+} -+ - /*! 
ZSTD_flushStream() : - * @return : amount of data remaining to flush */ - size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) - { -- ZSTD_inBuffer input = { NULL, 0, 0 }; -+ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); -+ input.size = input.pos; /* do not ingest more input during flush */ - return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); - } - - - size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) - { -- ZSTD_inBuffer input = { NULL, 0, 0 }; -+ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); - size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); -- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); -+ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); - if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ - /* single thread mode : attempt to calculate remaining to flush more precisely */ - { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; -@@ -6092,7 +6879,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, - cp.targetLength = (unsigned)(-clampedCompressionLevel); - } - /* refine parameters based on srcSize & dictSize */ -- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); -+ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); - } - } - -@@ -6127,3 +6914,29 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH - if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; - return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); - } -+ -+void ZSTD_registerSequenceProducer( -+ ZSTD_CCtx* zc, -+ void* extSeqProdState, -+ ZSTD_sequenceProducer_F extSeqProdFunc -+) { -+ assert(zc != NULL); -+ ZSTD_CCtxParams_registerSequenceProducer( -+ &zc->requestedParams, extSeqProdState, extSeqProdFunc -+ ); -+} -+ -+void ZSTD_CCtxParams_registerSequenceProducer( -+ ZSTD_CCtx_params* params, -+ void* extSeqProdState, -+ ZSTD_sequenceProducer_F extSeqProdFunc -+) { -+ assert(params != NULL); -+ if (extSeqProdFunc != NULL) { -+ params->extSeqProdFunc = extSeqProdFunc; -+ params->extSeqProdState = extSeqProdState; -+ } else { -+ params->extSeqProdFunc = NULL; -+ params->extSeqProdState = NULL; -+ } -+} -diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h -index 71697a11ae30..53cb582a8d2b 100644 ---- a/lib/zstd/compress/zstd_compress_internal.h -+++ b/lib/zstd/compress/zstd_compress_internal.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -20,6 +21,7 @@ - ***************************************/ - #include "../common/zstd_internal.h" - #include "zstd_cwksp.h" -+#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ - - - /*-************************************* -@@ -32,7 +34,7 @@ - It's not a big deal though : candidate will just be sorted again. - Additionally, candidate position 1 will be lost. - But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. -- The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. 
-+ The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy. - This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ - - -@@ -111,12 +113,13 @@ typedef struct { - /* ZSTD_buildBlockEntropyStats() : - * Builds entropy for the block. - * @return : 0 on success or error code */ --size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, -- const ZSTD_entropyCTables_t* prevEntropy, -- ZSTD_entropyCTables_t* nextEntropy, -- const ZSTD_CCtx_params* cctxParams, -- ZSTD_entropyCTablesMetadata_t* entropyMetadata, -- void* workspace, size_t wkspSize); -+size_t ZSTD_buildBlockEntropyStats( -+ const seqStore_t* seqStorePtr, -+ const ZSTD_entropyCTables_t* prevEntropy, -+ ZSTD_entropyCTables_t* nextEntropy, -+ const ZSTD_CCtx_params* cctxParams, -+ ZSTD_entropyCTablesMetadata_t* entropyMetadata, -+ void* workspace, size_t wkspSize); - - /* ******************************* - * Compression internals structs * -@@ -142,26 +145,33 @@ typedef struct { - size_t capacity; /* The capacity starting from `seq` pointer */ - } rawSeqStore_t; - -+typedef struct { -+ U32 idx; /* Index in array of ZSTD_Sequence */ -+ U32 posInSequence; /* Position within sequence at idx */ -+ size_t posInSrc; /* Number of bytes given by sequences provided so far */ -+} ZSTD_sequencePosition; -+ - UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; - - typedef struct { -- int price; -- U32 off; -- U32 mlen; -- U32 litlen; -- U32 rep[ZSTD_REP_NUM]; -+ int price; /* price from beginning of segment to this position */ -+ U32 off; /* offset of previous match */ -+ U32 mlen; /* length of previous match */ -+ U32 litlen; /* nb of literals since previous match */ -+ U32 rep[ZSTD_REP_NUM]; /* offset history after previous match */ - } ZSTD_optimal_t; - - typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; - -+#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3) - typedef struct { - /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ - unsigned* litFreq; /* table of literals statistics, of size 256 */ - unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ - unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ - unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ -- ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ -- ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ -+ ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_SIZE */ -+ ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */ - - U32 litSum; /* nb of literals */ - U32 litLengthSum; /* nb of litLength codes */ -@@ -212,8 +222,10 @@ struct ZSTD_matchState_t { - U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ - - U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ -- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ -+ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. 
*/ - U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ -+ U64 hashSalt; /* For row-based matchFinder: salts the hash for reuse of tag table */ -+ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ - - U32* hashTable; - U32* hashTable3; -@@ -228,6 +240,18 @@ struct ZSTD_matchState_t { - const ZSTD_matchState_t* dictMatchState; - ZSTD_compressionParameters cParams; - const rawSeqStore_t* ldmSeqStore; -+ -+ /* Controls prefetching in some dictMatchState matchfinders. -+ * This behavior is controlled from the cctx ms. -+ * This parameter has no effect in the cdict ms. */ -+ int prefetchCDictTables; -+ -+ /* When == 0, lazy match finders insert every position. -+ * When != 0, lazy match finders only insert positions they search. -+ * This allows them to skip much faster over incompressible data, -+ * at a small cost to compression ratio. -+ */ -+ int lazySkipping; - }; - - typedef struct { -@@ -324,6 +348,25 @@ struct ZSTD_CCtx_params_s { - - /* Internal use, for createCCtxParams() and freeCCtxParams() only */ - ZSTD_customMem customMem; -+ -+ /* Controls prefetching in some dictMatchState matchfinders */ -+ ZSTD_paramSwitch_e prefetchCDictTables; -+ -+ /* Controls whether zstd will fall back to an internal matchfinder -+ * if the external matchfinder returns an error code. */ -+ int enableMatchFinderFallback; -+ -+ /* Parameters for the external sequence producer API. -+ * Users set these parameters through ZSTD_registerSequenceProducer(). -+ * It is not possible to set these parameters individually through the public API. */ -+ void* extSeqProdState; -+ ZSTD_sequenceProducer_F extSeqProdFunc; -+ -+ /* Adjust the max block size*/ -+ size_t maxBlockSize; -+ -+ /* Controls repcode search in external sequence parsing */ -+ ZSTD_paramSwitch_e searchForExternalRepcodes; - }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ - - #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) -@@ -404,6 +447,7 @@ struct ZSTD_CCtx_s { - - /* Stable in/out buffer verification */ - ZSTD_inBuffer expectedInBuffer; -+ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ - size_t expectedOutBufferSize; - - /* Dictionary */ -@@ -417,9 +461,14 @@ struct ZSTD_CCtx_s { - - /* Workspace for block splitter */ - ZSTD_blockSplitCtx blockSplitCtx; -+ -+ /* Buffer for output from external sequence producer */ -+ ZSTD_Sequence* extSeqBuf; -+ size_t extSeqBufCapacity; - }; - - typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; -+typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; - - typedef enum { - ZSTD_noDict = 0, -@@ -441,7 +490,7 @@ typedef enum { - * In this mode we take both the source size and the dictionary size - * into account when selecting and adjusting the parameters. - */ -- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. -+ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. - * We don't know what these parameters are for. We default to the legacy - * behavior of taking both the source size and the dict size into account - * when selecting and adjusting parameters. -@@ -500,9 +549,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) - /* ZSTD_noCompressBlock() : - * Writes uncompressed block to dst buffer from given src. 
- * Returns the size of the block */ --MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) -+MEM_STATIC size_t -+ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) - { - U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); -+ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); - RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, - dstSize_tooSmall, "dst buf too small for uncompressed block"); - MEM_writeLE24(dst, cBlockHeader24); -@@ -510,7 +561,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi - return ZSTD_blockHeaderSize + srcSize; - } - --MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) -+MEM_STATIC size_t -+ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) - { - BYTE* const op = (BYTE*)dst; - U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); -@@ -529,7 +581,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) - { - U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6; - ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); -- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); -+ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); - return (srcSize >> minlog) + 2; - } - -@@ -565,29 +617,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con - while (ip < iend) *op++ = *ip++; - } - --#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) --#define STORE_REPCODE_1 STORE_REPCODE(1) --#define STORE_REPCODE_2 STORE_REPCODE(2) --#define STORE_REPCODE_3 STORE_REPCODE(3) --#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) --#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) --#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) --#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) --#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) --#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ --#define STORED_TO_OFFBASE(o) ((o)+1) --#define OFFBASE_TO_STORED(o) ((o)-1) -+ -+#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) -+#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) -+#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) -+#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ -+#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) -+#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) -+#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) -+#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) -+#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ - - /*! ZSTD_storeSeq() : -- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. -- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). -+ * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t. -+ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). - * @matchLength : must be >= MINMATCH -- * Allowed to overread literals up to litLimit. -+ * Allowed to over-read literals up to litLimit. 
- */ - HINT_INLINE UNUSED_ATTR void - ZSTD_storeSeq(seqStore_t* seqStorePtr, - size_t litLength, const BYTE* literals, const BYTE* litLimit, -- U32 offBase_minus1, -+ U32 offBase, - size_t matchLength) - { - BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; -@@ -596,8 +646,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, - static const BYTE* g_start = NULL; - if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ - { U32 const pos = (U32)((const BYTE*)literals - g_start); -- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", -- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); -+ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", -+ pos, (U32)litLength, (U32)matchLength, (U32)offBase); - } - #endif - assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); -@@ -607,9 +657,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, - assert(literals + litLength <= litLimit); - if (litEnd <= litLimit_w) { - /* Common case we can use wildcopy. -- * First copy 16 bytes, because literals are likely short. -- */ -- assert(WILDCOPY_OVERLENGTH >= 16); -+ * First copy 16 bytes, because literals are likely short. -+ */ -+ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); - ZSTD_copy16(seqStorePtr->lit, literals); - if (litLength > 16) { - ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); -@@ -628,7 +678,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, - seqStorePtr->sequences[0].litLength = (U16)litLength; - - /* match offset */ -- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); -+ seqStorePtr->sequences[0].offBase = offBase; - - /* match Length */ - assert(matchLength >= MINMATCH); -@@ -646,17 +696,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, - - /* ZSTD_updateRep() : - * updates in-place @rep (array of repeat offsets) -- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() -+ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() - */ - MEM_STATIC void --ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) -+ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) - { -- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ -+ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ - rep[2] = rep[1]; - rep[1] = rep[0]; -- rep[0] = STORED_OFFSET(offBase_minus1); -+ rep[0] = OFFBASE_TO_OFFSET(offBase); - } else { /* repcode */ -- U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; -+ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; - if (repCode > 0) { /* note : if repCode==0, no change */ - U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; - rep[2] = (repCode >= 2) ? 
rep[1] : rep[2]; -@@ -673,11 +723,11 @@ typedef struct repcodes_s { - } repcodes_t; - - MEM_STATIC repcodes_t --ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) -+ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) - { - repcodes_t newReps; - ZSTD_memcpy(&newReps, rep, sizeof(newReps)); -- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); -+ ZSTD_updateRep(newReps.rep, offBase, ll0); - return newReps; - } - -@@ -685,59 +735,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 - /*-************************************* - * Match length counter - ***************************************/ --static unsigned ZSTD_NbCommonBytes (size_t val) --{ -- if (MEM_isLittleEndian()) { -- if (MEM_64bits()) { --# if (__GNUC__ >= 4) -- return (__builtin_ctzll((U64)val) >> 3); --# else -- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, -- 0, 3, 1, 3, 1, 4, 2, 7, -- 0, 2, 3, 6, 1, 5, 3, 5, -- 1, 3, 4, 4, 2, 5, 6, 7, -- 7, 0, 1, 2, 3, 3, 4, 6, -- 2, 6, 5, 5, 3, 4, 5, 6, -- 7, 1, 2, 4, 6, 4, 4, 5, -- 7, 2, 6, 5, 7, 6, 7, 7 }; -- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; --# endif -- } else { /* 32 bits */ --# if (__GNUC__ >= 3) -- return (__builtin_ctz((U32)val) >> 3); --# else -- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, -- 3, 2, 2, 1, 3, 2, 0, 1, -- 3, 3, 1, 2, 2, 2, 2, 0, -- 3, 1, 2, 0, 1, 0, 1, 1 }; -- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; --# endif -- } -- } else { /* Big Endian CPU */ -- if (MEM_64bits()) { --# if (__GNUC__ >= 4) -- return (__builtin_clzll(val) >> 3); --# else -- unsigned r; -- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ -- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } -- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } -- r += (!val); -- return r; --# endif -- } else { /* 32 bits */ --# if (__GNUC__ >= 3) -- return (__builtin_clz((U32)val) >> 3); --# else -- unsigned r; -- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } -- r += (!val); -- return r; --# endif -- } } --} -- -- - MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) - { - const BYTE* const pStart = pIn; -@@ -783,32 +780,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, - * Hashes - ***************************************/ - static const U32 prime3bytes = 506832829U; --static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } --MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ -+static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } -+MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ -+MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } - - static const U32 prime4bytes = 2654435761U; --static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } --static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } -+static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } -+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } -+static size_t 
ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } - - static const U64 prime5bytes = 889523592379ULL; --static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } --static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } -+static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } -+static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } -+static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } - - static const U64 prime6bytes = 227718039650203ULL; --static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } --static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } -+static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } -+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } -+static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } - - static const U64 prime7bytes = 58295818150454627ULL; --static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } --static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } -+static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } -+static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } -+static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } - - static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; --static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } --static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } -+static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; } -+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } -+static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } -+ - - MEM_STATIC FORCE_INLINE_ATTR - size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) - { -+ /* Although some of these hashes do support hBits up to 64, some do not. -+ * To be on the safe side, always avoid hBits > 32. */ -+ assert(hBits <= 32); -+ - switch(mls) - { - default: -@@ -820,6 +828,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) - } - } - -+MEM_STATIC FORCE_INLINE_ATTR -+size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { -+ /* Although some of these hashes do support hBits up to 64, some do not. -+ * To be on the safe side, always avoid hBits > 32. */ -+ assert(hBits <= 32); -+ -+ switch(mls) -+ { -+ default: -+ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); -+ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); -+ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); -+ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); -+ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); -+ } -+} -+ -+ - /* ZSTD_ipow() : - * Return base^exponent. 
- */ -@@ -1011,7 +1037,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, - * The least significant cycleLog bits of the indices must remain the same, - * which may be 0. Every index up to maxDist in the past must be valid. - */ --MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, -+MEM_STATIC -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, - U32 maxDist, void const* src) - { - /* preemptive overflow correction: -@@ -1167,10 +1195,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, - (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); - assert(blockEndIdx >= loadedDictEnd); - -- if (blockEndIdx > loadedDictEnd + maxDist) { -+ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { - /* On reaching window size, dictionaries are invalidated. - * For simplification, if window size is reached anywhere within next block, - * the dictionary is invalidated for the full block. -+ * -+ * We also have to invalidate the dictionary if ZSTD_window_update() has detected -+ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. -+ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use -+ * dictMatchState, so setting it to NULL is not a problem. - */ - DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); - *loadedDictEndPtr = 0; -@@ -1199,7 +1232,9 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { - * forget about the extDict. Handles overlap of the prefix and extDict. - * Returns non-zero if the segment is contiguous. - */ --MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, -+MEM_STATIC -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+U32 ZSTD_window_update(ZSTD_window_t* window, - void const* src, size_t srcSize, - int forceNonContiguous) - { -@@ -1302,6 +1337,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) - - #endif - -+/* Short Cache */ -+ -+/* Normally, zstd matchfinders follow this flow: -+ * 1. Compute hash at ip -+ * 2. Load index from hashTable[hash] -+ * 3. Check if *ip == *(base + index) -+ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. -+ * -+ * Short cache is an optimization which allows us to avoid step 3 most of the time -+ * when the data doesn't actually match. With short cache, the flow becomes: -+ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. -+ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. -+ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. -+ * -+ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to -+ * dictMatchState matchfinders. -+ */ -+#define ZSTD_SHORT_CACHE_TAG_BITS 8 -+#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) -+ -+/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. -+ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. 
*/ -+MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { -+ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; -+ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); -+ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); -+ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; -+} -+ -+/* Helper function for short cache matchfinders. -+ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */ -+MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { -+ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; -+ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; -+ return tag1 == tag2; -+} - - - /* =============================================================== -@@ -1381,11 +1452,10 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity); - * This cannot be used when long range matching is enabled. - * Zstd will use these sequences, and pass the literals to a secondary block - * compressor. -- * @return : An error code on failure. - * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory - * access and data corruption. - */ --size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); -+void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); - - /* ZSTD_cycleLog() : - * condition for correct operation : hashLog > 1 */ -@@ -1396,4 +1466,55 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); - */ - void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); - -+/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of -+ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. -+ * Note that the block delimiter must include the last literals of the block. -+ */ -+size_t -+ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, -+ ZSTD_sequencePosition* seqPos, -+ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, -+ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); -+ -+/* Returns the number of bytes to move the current read position back by. -+ * Only non-zero if we ended up splitting a sequence. -+ * Otherwise, it may return a ZSTD error if something went wrong. -+ * -+ * This function will attempt to scan through blockSize bytes -+ * represented by the sequences in @inSeqs, -+ * storing any (partial) sequences. -+ * -+ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to -+ * avoid splitting a match, or to avoid splitting a match such that it would produce a match -+ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. -+ */ -+size_t -+ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, -+ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, -+ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); -+ -+/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */ -+MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) { -+ return params->extSeqProdFunc != NULL; -+} -+ -+/* =============================================================== -+ * Deprecated definitions that are still used internally to avoid -+ * deprecation warnings. These functions are exactly equivalent to -+ * their public variants, but avoid the deprecation warnings. 
-+ * =============================================================== */ -+ -+size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); -+ -+size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, -+ void* dst, size_t dstCapacity, -+ const void* src, size_t srcSize); -+ -+size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, -+ void* dst, size_t dstCapacity, -+ const void* src, size_t srcSize); -+ -+size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); -+ -+ - #endif /* ZSTD_COMPRESS_H */ -diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c -index 52b0a8059aba..3e9ea46a670a 100644 ---- a/lib/zstd/compress/zstd_compress_literals.c -+++ b/lib/zstd/compress/zstd_compress_literals.c -@@ -1,5 +1,6 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -13,11 +14,36 @@ - ***************************************/ - #include "zstd_compress_literals.h" - -+ -+/* ************************************************************** -+* Debug Traces -+****************************************************************/ -+#if DEBUGLEVEL >= 2 -+ -+static size_t showHexa(const void* src, size_t srcSize) -+{ -+ const BYTE* const ip = (const BYTE*)src; -+ size_t u; -+ for (u=0; u31) + (srcSize>4095); - -+ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); -+ - RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); - - switch(flSize) -@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, - } - - ZSTD_memcpy(ostart + flSize, src, srcSize); -- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); -+ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); - return srcSize + flSize; - } - -+static int allBytesIdentical(const void* src, size_t srcSize) -+{ -+ assert(srcSize >= 1); -+ assert(src != NULL); -+ { const BYTE b = ((const BYTE*)src)[0]; -+ size_t p; -+ for (p=1; p31) + (srcSize>4095); - -- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ -+ assert(dstCapacity >= 4); (void)dstCapacity; -+ assert(allBytesIdentical(src, srcSize)); - - switch(flSize) - { -@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* - } - - ostart[flSize] = *(const BYTE*)src; -- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); -+ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); - return flSize+1; - } - --size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, -- ZSTD_hufCTables_t* nextHuf, -- ZSTD_strategy strategy, int disableLiteralCompression, -- void* dst, size_t dstCapacity, -- const void* src, size_t srcSize, -- void* entropyWorkspace, size_t entropyWorkspaceSize, -- const int bmi2, -- unsigned suspectUncompressible) -+/* ZSTD_minLiteralsToCompress() : -+ * returns minimal amount of literals -+ * for literal compression to even be attempted. -+ * Minimum is made tighter as compression strategy increases. 
-+ */ -+static size_t -+ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) -+{ -+ assert((int)strategy >= 0); -+ assert((int)strategy <= 9); -+ /* btultra2 : min 8 bytes; -+ * then 2x larger for each successive compression strategy -+ * max threshold 64 bytes */ -+ { int const shift = MIN(9-(int)strategy, 3); -+ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; -+ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); -+ return mintc; -+ } -+} -+ -+size_t ZSTD_compressLiterals ( -+ void* dst, size_t dstCapacity, -+ const void* src, size_t srcSize, -+ void* entropyWorkspace, size_t entropyWorkspaceSize, -+ const ZSTD_hufCTables_t* prevHuf, -+ ZSTD_hufCTables_t* nextHuf, -+ ZSTD_strategy strategy, -+ int disableLiteralCompression, -+ int suspectUncompressible, -+ int bmi2) - { -- size_t const minGain = ZSTD_minGain(srcSize, strategy); - size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); - BYTE* const ostart = (BYTE*)dst; - U32 singleStream = srcSize < 256; - symbolEncodingType_e hType = set_compressed; - size_t cLitSize; - -- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", -- disableLiteralCompression, (U32)srcSize); -+ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", -+ disableLiteralCompression, (U32)srcSize, dstCapacity); -+ -+ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); - - /* Prepare nextEntropy assuming reusing the existing table */ - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); -@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, - if (disableLiteralCompression) - return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); - -- /* small ? don't even attempt compression (speed opt) */ --# define COMPRESS_LITERALS_SIZE_MIN 63 -- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; -- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); -- } -+ /* if too small, don't even attempt compression (speed opt) */ -+ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) -+ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); - - RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); - { HUF_repeat repeat = prevHuf->repeatMode; -- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; -+ int const flags = 0 -+ | (bmi2 ? HUF_flags_bmi2 : 0) -+ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) -+ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) -+ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); -+ -+ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); -+ huf_compress_f huf_compress; - if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; -- cLitSize = singleStream ? 
-- HUF_compress1X_repeat( -- ostart+lhSize, dstCapacity-lhSize, src, srcSize, -- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, -- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : -- HUF_compress4X_repeat( -- ostart+lhSize, dstCapacity-lhSize, src, srcSize, -- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, -- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); -+ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; -+ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, -+ src, srcSize, -+ HUF_SYMBOLVALUE_MAX, LitHufLog, -+ entropyWorkspace, entropyWorkspaceSize, -+ (HUF_CElt*)nextHuf->CTable, -+ &repeat, flags); -+ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); - if (repeat != HUF_repeat_none) { - /* reused the existing table */ -- DEBUGLOG(5, "Reusing previous huffman table"); -+ DEBUGLOG(5, "reusing statistics from previous huffman block"); - hType = set_repeat; - } - } - -- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { -- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); -- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); -- } -+ { size_t const minGain = ZSTD_minGain(srcSize, strategy); -+ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { -+ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); -+ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); -+ } } - if (cLitSize==1) { -- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); -- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); -- } -+ /* A return value of 1 signals that the alphabet consists of a single symbol. -+ * However, in some rare circumstances, it could be the compressed size (a single byte). -+ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. -+ * (it's also necessary to not generate statistics). -+ * Therefore, in such a case, actively check that all bytes are identical. 
*/ -+ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { -+ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); -+ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); -+ } } - - if (hType == set_compressed) { - /* using a newly constructed table */ -@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, - switch(lhSize) - { - case 3: /* 2 - 2 - 10 - 10 */ -- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); -+ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); -+ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); - MEM_writeLE24(ostart, lhc); - break; - } - case 4: /* 2 - 2 - 14 - 14 */ -+ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); - { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); - MEM_writeLE32(ostart, lhc); - break; - } - case 5: /* 2 - 2 - 18 - 18 */ -+ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); - { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); - MEM_writeLE32(ostart, lhc); - ostart[4] = (BYTE)(cLitSize >> 10); -diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h -index 9775fb97cb70..a2a85d6b69e5 100644 ---- a/lib/zstd/compress/zstd_compress_literals.h -+++ b/lib/zstd/compress/zstd_compress_literals.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -16,16 +17,24 @@ - - size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); - -+/* ZSTD_compressRleLiteralsBlock() : -+ * Conditions : -+ * - All bytes in @src are identical -+ * - dstCapacity >= 4 */ - size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); - --/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ --size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, -- ZSTD_hufCTables_t* nextHuf, -- ZSTD_strategy strategy, int disableLiteralCompression, -- void* dst, size_t dstCapacity, -+/* ZSTD_compressLiterals(): -+ * @entropyWorkspace: must be aligned on 4-bytes boundaries -+ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE -+ * @suspectUncompressible: sampling checks, to potentially skip huffman coding -+ */ -+size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - void* entropyWorkspace, size_t entropyWorkspaceSize, -- const int bmi2, -- unsigned suspectUncompressible); -+ const ZSTD_hufCTables_t* prevHuf, -+ ZSTD_hufCTables_t* nextHuf, -+ ZSTD_strategy strategy, int disableLiteralCompression, -+ int suspectUncompressible, -+ int bmi2); - - #endif /* ZSTD_COMPRESS_LITERALS_H */ -diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c -index 21ddc1b37acf..5c028c78d889 100644 ---- a/lib/zstd/compress/zstd_compress_sequences.c -+++ b/lib/zstd/compress/zstd_compress_sequences.c -@@ -1,5 +1,6 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq) - { - /* Heuristic: This should cover most blocks <= 16K and - * start to fade out after 16K to about 32K depending on -- * comprssibility. -+ * compressibility. - */ - return nbSeq >= 2048; - } -@@ -166,7 +167,7 @@ ZSTD_selectEncodingType( - if (mostFrequent == nbSeq) { - *repeatMode = FSE_repeat_none; - if (isDefaultAllowed && nbSeq <= 2) { -- /* Prefer set_basic over set_rle when there are 2 or less symbols, -+ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, - * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. - * If basic encoding isn't possible, always choose RLE. - */ -diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h -index 7991364c2f71..7fe6f4ff5cf2 100644 ---- a/lib/zstd/compress/zstd_compress_sequences.h -+++ b/lib/zstd/compress/zstd_compress_sequences.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c -index 17d836cc84e8..41f6521b27cd 100644 ---- a/lib/zstd/compress/zstd_compress_superblock.c -+++ b/lib/zstd/compress/zstd_compress_superblock.c -@@ -1,5 +1,6 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -36,13 +37,14 @@ - * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block - * and the following sub-blocks' literals sections will be Treeless_Literals_Block. - * @return : compressed size of literals section of a sub-block -- * Or 0 if it unable to compress. -+ * Or 0 if unable to compress. - * Or error code */ --static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, -- const ZSTD_hufCTablesMetadata_t* hufMetadata, -- const BYTE* literals, size_t litSize, -- void* dst, size_t dstSize, -- const int bmi2, int writeEntropy, int* entropyWritten) -+static size_t -+ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, -+ const ZSTD_hufCTablesMetadata_t* hufMetadata, -+ const BYTE* literals, size_t litSize, -+ void* dst, size_t dstSize, -+ const int bmi2, int writeEntropy, int* entropyWritten) - { - size_t const header = writeEntropy ? 200 : 0; - size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); -@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, - symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; - size_t cLitSize = 0; - -- (void)bmi2; /* TODO bmi2... */ -- - DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); - - *entropyWritten = 0; -@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, - DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); - } - -- /* TODO bmi2 */ -- { const size_t cSize = singleStream ? 
HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) -- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); -+ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; -+ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags) -+ : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags); - op += cSize; - cLitSize += cSize; - if (cSize == 0 || ERR_isError(cSize)) { -@@ -103,7 +103,7 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, - switch(lhSize) - { - case 3: /* 2 - 2 - 10 - 10 */ -- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); -+ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); - MEM_writeLE24(ostart, lhc); - break; - } -@@ -123,26 +123,30 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, - } - *entropyWritten = 1; - DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); -- return op-ostart; -+ return (size_t)(op-ostart); - } - --static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { -- const seqDef* const sstart = sequences; -- const seqDef* const send = sequences + nbSeq; -- const seqDef* sp = sstart; -+static size_t -+ZSTD_seqDecompressedSize(seqStore_t const* seqStore, -+ const seqDef* sequences, size_t nbSeqs, -+ size_t litSize, int lastSubBlock) -+{ - size_t matchLengthSum = 0; - size_t litLengthSum = 0; -- (void)(litLengthSum); /* suppress unused variable warning on some environments */ -- while (send-sp > 0) { -- ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); -+ size_t n; -+ for (n=0; ncParams.windowLog > STREAM_ACCUMULATOR_MIN; - BYTE* const ostart = (BYTE*)dst; -@@ -176,14 +181,14 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables - /* Sequences Header */ - RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, - dstSize_tooSmall, ""); -- if (nbSeq < 0x7F) -+ if (nbSeq < 128) - *op++ = (BYTE)nbSeq; - else if (nbSeq < LONGNBSEQ) - op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; - else - op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; - if (nbSeq==0) { -- return op - ostart; -+ return (size_t)(op - ostart); - } - - /* seqHead : flags for FSE encoding type */ -@@ -205,7 +210,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables - } - - { size_t const bitstreamSize = ZSTD_encodeSequences( -- op, oend - op, -+ op, (size_t)(oend - op), - fseTables->matchlengthCTable, mlCode, - fseTables->offcodeCTable, ofCode, - fseTables->litlengthCTable, llCode, -@@ -249,7 +254,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables - #endif - - *entropyWritten = 1; -- return op - ostart; -+ return (size_t)(op - ostart); - } - - /* ZSTD_compressSubBlock() : -@@ -275,7 +280,8 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, - litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); - { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, - &entropyMetadata->hufMetadata, literals, litSize, -- op, oend-op, bmi2, writeLitEntropy, litEntropyWritten); -+ op, (size_t)(oend-op), -+ bmi2, writeLitEntropy, litEntropyWritten); - FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); - if (cLitSize == 0) 
return 0; - op += cLitSize; -@@ -285,18 +291,18 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, - sequences, nbSeq, - llCode, mlCode, ofCode, - cctxParams, -- op, oend-op, -+ op, (size_t)(oend-op), - bmi2, writeSeqEntropy, seqEntropyWritten); - FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed"); - if (cSeqSize == 0) return 0; - op += cSeqSize; - } - /* Write block header */ -- { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize; -+ { size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize; - U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); - MEM_writeLE24(ostart, cBlockHeader24); - } -- return op-ostart; -+ return (size_t)(op-ostart); - } - - static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, -@@ -385,7 +391,11 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable, - return cSeqSizeEstimate + sequencesSectionHeaderSize; - } - --static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, -+typedef struct { -+ size_t estLitSize; -+ size_t estBlockSize; -+} EstimatedBlockSize; -+static EstimatedBlockSize ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, - const BYTE* ofCodeTable, - const BYTE* llCodeTable, - const BYTE* mlCodeTable, -@@ -393,15 +403,17 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, - const ZSTD_entropyCTables_t* entropy, - const ZSTD_entropyCTablesMetadata_t* entropyMetadata, - void* workspace, size_t wkspSize, -- int writeLitEntropy, int writeSeqEntropy) { -- size_t cSizeEstimate = 0; -- cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize, -- &entropy->huf, &entropyMetadata->hufMetadata, -- workspace, wkspSize, writeLitEntropy); -- cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, -+ int writeLitEntropy, int writeSeqEntropy) -+{ -+ EstimatedBlockSize ebs; -+ ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize, -+ &entropy->huf, &entropyMetadata->hufMetadata, -+ workspace, wkspSize, writeLitEntropy); -+ ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, - nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, - workspace, wkspSize, writeSeqEntropy); -- return cSizeEstimate + ZSTD_blockHeaderSize; -+ ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize; -+ return ebs; - } - - static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) -@@ -415,13 +427,56 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe - return 0; - } - -+static size_t countLiterals(seqStore_t const* seqStore, const seqDef* sp, size_t seqCount) -+{ -+ size_t n, total = 0; -+ assert(sp != NULL); -+ for (n=0; n %zu bytes", seqCount, (const void*)sp, total); -+ return total; -+} -+ -+#define BYTESCALE 256 -+ -+static size_t sizeBlockSequences(const seqDef* sp, size_t nbSeqs, -+ size_t targetBudget, size_t avgLitCost, size_t avgSeqCost, -+ int firstSubBlock) -+{ -+ size_t n, budget = 0, inSize=0; -+ /* entropy headers */ -+ size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */ -+ assert(firstSubBlock==0 || firstSubBlock==1); -+ budget += headerSize; -+ -+ /* first sequence => at least one sequence*/ -+ budget += sp[0].litLength * avgLitCost + avgSeqCost; -+ if (budget > targetBudget) return 1; -+ inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH); -+ -+ /* loop over sequences */ -+ for (n=1; 
n targetBudget) -+ /* though continue to expand until the sub-block is deemed compressible */ -+ && (budget < inSize * BYTESCALE) ) -+ break; -+ } -+ -+ return n; -+} -+ - /* ZSTD_compressSubBlock_multi() : - * Breaks super-block into multiple sub-blocks and compresses them. -- * Entropy will be written to the first block. -- * The following blocks will use repeat mode to compress. -- * All sub-blocks are compressed blocks (no raw or rle blocks). -- * @return : compressed size of the super block (which is multiple ZSTD blocks) -- * Or 0 if it failed to compress. */ -+ * Entropy will be written into the first block. -+ * The following blocks use repeat_mode to compress. -+ * Sub-blocks are all compressed, except the last one when beneficial. -+ * @return : compressed size of the super block (which features multiple ZSTD blocks) -+ * or 0 if it failed to compress. */ - static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, - const ZSTD_compressedBlockState_t* prevCBlock, - ZSTD_compressedBlockState_t* nextCBlock, -@@ -434,10 +489,12 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, - { - const seqDef* const sstart = seqStorePtr->sequencesStart; - const seqDef* const send = seqStorePtr->sequences; -- const seqDef* sp = sstart; -+ const seqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */ -+ size_t const nbSeqs = (size_t)(send - sstart); - const BYTE* const lstart = seqStorePtr->litStart; - const BYTE* const lend = seqStorePtr->lit; - const BYTE* lp = lstart; -+ size_t const nbLiterals = (size_t)(lend - lstart); - BYTE const* ip = (BYTE const*)src; - BYTE const* const iend = ip + srcSize; - BYTE* const ostart = (BYTE*)dst; -@@ -446,112 +503,171 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, - const BYTE* llCodePtr = seqStorePtr->llCode; - const BYTE* mlCodePtr = seqStorePtr->mlCode; - const BYTE* ofCodePtr = seqStorePtr->ofCode; -- size_t targetCBlockSize = cctxParams->targetCBlockSize; -- size_t litSize, seqCount; -- int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed; -+ size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */ -+ size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize); -+ int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed); - int writeSeqEntropy = 1; -- int lastSequence = 0; -- -- DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)", -- (unsigned)(lend-lp), (unsigned)(send-sstart)); -- -- litSize = 0; -- seqCount = 0; -- do { -- size_t cBlockSizeEstimate = 0; -- if (sstart == send) { -- lastSequence = 1; -- } else { -- const seqDef* const sequence = sp + seqCount; -- lastSequence = sequence == send - 1; -- litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength; -- seqCount++; -- } -- if (lastSequence) { -- assert(lp <= lend); -- assert(litSize <= (size_t)(lend - lp)); -- litSize = (size_t)(lend - lp); -+ -+ DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)", -+ (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart)); -+ -+ /* let's start by a general estimation for the full block */ -+ if (nbSeqs > 0) { -+ EstimatedBlockSize const ebs = -+ ZSTD_estimateSubBlockSize(lp, nbLiterals, -+ ofCodePtr, llCodePtr, mlCodePtr, nbSeqs, -+ &nextCBlock->entropy, entropyMetadata, -+ workspace, wkspSize, -+ writeLitEntropy, writeSeqEntropy); -+ /* quick estimation */ -+ size_t const avgLitCost = nbLiterals ? 
(ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE; -+ size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs; -+ const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1); -+ size_t n, avgBlockBudget, blockBudgetSupp=0; -+ avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks; -+ DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes", -+ (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE, -+ (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE); -+ /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately -+ * this will result in the production of a single uncompressed block covering @srcSize.*/ -+ if (ebs.estBlockSize > srcSize) return 0; -+ -+ /* compress and write sub-blocks */ -+ assert(nbSubBlocks>0); -+ for (n=0; n < nbSubBlocks-1; n++) { -+ /* determine nb of sequences for current sub-block + nbLiterals from next sequence */ -+ size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp), -+ avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0); -+ /* if reached last sequence : break to last sub-block (simplification) */ -+ assert(seqCount <= (size_t)(send-sp)); -+ if (sp + seqCount == send) break; -+ assert(seqCount > 0); -+ /* compress sub-block */ -+ { int litEntropyWritten = 0; -+ int seqEntropyWritten = 0; -+ size_t litSize = countLiterals(seqStorePtr, sp, seqCount); -+ const size_t decompressedSize = -+ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0); -+ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, -+ sp, seqCount, -+ lp, litSize, -+ llCodePtr, mlCodePtr, ofCodePtr, -+ cctxParams, -+ op, (size_t)(oend-op), -+ bmi2, writeLitEntropy, writeSeqEntropy, -+ &litEntropyWritten, &seqEntropyWritten, -+ 0); -+ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); -+ -+ /* check compressibility, update state components */ -+ if (cSize > 0 && cSize < decompressedSize) { -+ DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes", -+ (unsigned)decompressedSize, (unsigned)cSize); -+ assert(ip + decompressedSize <= iend); -+ ip += decompressedSize; -+ lp += litSize; -+ op += cSize; -+ llCodePtr += seqCount; -+ mlCodePtr += seqCount; -+ ofCodePtr += seqCount; -+ /* Entropy only needs to be written once */ -+ if (litEntropyWritten) { -+ writeLitEntropy = 0; -+ } -+ if (seqEntropyWritten) { -+ writeSeqEntropy = 0; -+ } -+ sp += seqCount; -+ blockBudgetSupp = 0; -+ } } -+ /* otherwise : do not compress yet, coalesce current sub-block with following one */ - } -- /* I think there is an optimization opportunity here. -- * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful -- * since it recalculates estimate from scratch. -- * For example, it would recount literal distribution and symbol codes every time. 
-- */ -- cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, -- &nextCBlock->entropy, entropyMetadata, -- workspace, wkspSize, writeLitEntropy, writeSeqEntropy); -- if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { -- int litEntropyWritten = 0; -- int seqEntropyWritten = 0; -- const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); -- const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, -- sp, seqCount, -- lp, litSize, -- llCodePtr, mlCodePtr, ofCodePtr, -- cctxParams, -- op, oend-op, -- bmi2, writeLitEntropy, writeSeqEntropy, -- &litEntropyWritten, &seqEntropyWritten, -- lastBlock && lastSequence); -- FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); -- if (cSize > 0 && cSize < decompressedSize) { -- DEBUGLOG(5, "Committed the sub-block"); -- assert(ip + decompressedSize <= iend); -- ip += decompressedSize; -- sp += seqCount; -- lp += litSize; -- op += cSize; -- llCodePtr += seqCount; -- mlCodePtr += seqCount; -- ofCodePtr += seqCount; -- litSize = 0; -- seqCount = 0; -- /* Entropy only needs to be written once */ -- if (litEntropyWritten) { -- writeLitEntropy = 0; -- } -- if (seqEntropyWritten) { -- writeSeqEntropy = 0; -- } -+ } /* if (nbSeqs > 0) */ -+ -+ /* write last block */ -+ DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp)); -+ { int litEntropyWritten = 0; -+ int seqEntropyWritten = 0; -+ size_t litSize = (size_t)(lend - lp); -+ size_t seqCount = (size_t)(send - sp); -+ const size_t decompressedSize = -+ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1); -+ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, -+ sp, seqCount, -+ lp, litSize, -+ llCodePtr, mlCodePtr, ofCodePtr, -+ cctxParams, -+ op, (size_t)(oend-op), -+ bmi2, writeLitEntropy, writeSeqEntropy, -+ &litEntropyWritten, &seqEntropyWritten, -+ lastBlock); -+ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); -+ -+ /* update pointers, the nb of literals borrowed from next sequence must be preserved */ -+ if (cSize > 0 && cSize < decompressedSize) { -+ DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes", -+ (unsigned)decompressedSize, (unsigned)cSize); -+ assert(ip + decompressedSize <= iend); -+ ip += decompressedSize; -+ lp += litSize; -+ op += cSize; -+ llCodePtr += seqCount; -+ mlCodePtr += seqCount; -+ ofCodePtr += seqCount; -+ /* Entropy only needs to be written once */ -+ if (litEntropyWritten) { -+ writeLitEntropy = 0; - } -+ if (seqEntropyWritten) { -+ writeSeqEntropy = 0; -+ } -+ sp += seqCount; - } -- } while (!lastSequence); -+ } -+ -+ - if (writeLitEntropy) { -- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); -+ DEBUGLOG(5, "Literal entropy tables were never written"); - ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); - } - if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { - /* If we haven't written our entropy tables, then we've violated our contract and - * must emit an uncompressed block. 
- */ -- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); -+ DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block"); - return 0; - } -+ - if (ip < iend) { -- size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); -- DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); -+ /* some data left : last part of the block sent uncompressed */ -+ size_t const rSize = (size_t)((iend - ip)); -+ size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock); -+ DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize)); - FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); - assert(cSize != 0); - op += cSize; - /* We have to regenerate the repcodes because we've skipped some sequences */ - if (sp < send) { -- seqDef const* seq; -+ const seqDef* seq; - repcodes_t rep; - ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); - for (seq = sstart; seq < sp; ++seq) { -- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); -+ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); - } - ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); - } - } -- DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); -- return op-ostart; -+ -+ DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u", -+ (unsigned)(op-ostart)); -+ return (size_t)(op-ostart); - } - - size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, - void* dst, size_t dstCapacity, -- void const* src, size_t srcSize, -- unsigned lastBlock) { -+ const void* src, size_t srcSize, -+ unsigned lastBlock) -+{ - ZSTD_entropyCTablesMetadata_t entropyMetadata; - - FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, -diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h -index 224ece79546e..826bbc9e029b 100644 ---- a/lib/zstd/compress/zstd_compress_superblock.h -+++ b/lib/zstd/compress/zstd_compress_superblock.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h -index 349fc923c355..86bc3c2c23c7 100644 ---- a/lib/zstd/compress/zstd_cwksp.h -+++ b/lib/zstd/compress/zstd_cwksp.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -14,7 +15,9 @@ - /*-************************************* - * Dependencies - ***************************************/ -+#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ - #include "../common/zstd_internal.h" -+#include "../common/portability_macros.h" - - - /*-************************************* -@@ -41,8 +44,9 @@ - ***************************************/ - typedef enum { - ZSTD_cwksp_alloc_objects, -- ZSTD_cwksp_alloc_buffers, -- ZSTD_cwksp_alloc_aligned -+ ZSTD_cwksp_alloc_aligned_init_once, -+ ZSTD_cwksp_alloc_aligned, -+ ZSTD_cwksp_alloc_buffers - } ZSTD_cwksp_alloc_phase_e; - - /* -@@ -95,8 +99,8 @@ typedef enum { - * - * Workspace Layout: - * -- * [ ... workspace ... ] -- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] -+ * [ ... workspace ... ] -+ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] - * - * The various objects that live in the workspace are divided into the - * following categories, and are allocated separately: -@@ -120,9 +124,18 @@ typedef enum { - * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). - * Their sizes depend on the cparams. These tables are 64-byte aligned. - * -- * - Aligned: these buffers are used for various purposes that require 4 byte -- * alignment, but don't require any initialization before they're used. These -- * buffers are each aligned to 64 bytes. -+ * - Init once: these buffers require to be initialized at least once before -+ * use. They should be used when we want to skip memory initialization -+ * while not triggering memory checkers (like Valgrind) when reading from -+ * from this memory without writing to it first. -+ * These buffers should be used carefully as they might contain data -+ * from previous compressions. -+ * Buffers are aligned to 64 bytes. -+ * -+ * - Aligned: these buffers don't require any initialization before they're -+ * used. The user of the buffer should make sure they write into a buffer -+ * location before reading from it. -+ * Buffers are aligned to 64 bytes. - * - * - Buffers: these buffers are used for various purposes that don't require - * any alignment or initialization before they're used. This means they can -@@ -134,8 +147,9 @@ typedef enum { - * correctly packed into the workspace buffer. That order is: - * - * 1. Objects -- * 2. Buffers -- * 3. Aligned/Tables -+ * 2. Init once / Tables -+ * 3. Aligned / Tables -+ * 4. Buffers / Tables - * - * Attempts to reserve objects of different types out of order will fail. 
- */ -@@ -147,6 +161,7 @@ typedef struct { - void* tableEnd; - void* tableValidEnd; - void* allocStart; -+ void* initOnceStart; - - BYTE allocFailed; - int workspaceOversizedDuration; -@@ -159,6 +174,7 @@ typedef struct { - ***************************************/ - - MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); -+MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); - - MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { - (void)ws; -@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { - assert(ws->tableEnd <= ws->allocStart); - assert(ws->tableValidEnd <= ws->allocStart); - assert(ws->allocStart <= ws->workspaceEnd); -+ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); -+ assert(ws->workspace <= ws->initOnceStart); - } - - /* -@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { - * for internal purposes (currently only alignment). - */ - MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { -- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes -- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes -- * to align the beginning of the aligned section. -- * -- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and -- * aligneds being sized in multiples of 64 bytes. -+ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES -+ * bytes to align the beginning of tables section and end of buffers; - */ -- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; -+ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; - return slackSpace; - } - -@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt - size_t const alignBytesMask = alignBytes - 1; - size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; - assert((alignBytes & alignBytesMask) == 0); -- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); -+ assert(bytes < alignBytes); - return bytes; - } - -+/* -+ * Returns the initial value for allocStart which is used to determine the position from -+ * which we can allocate from the end of the workspace. -+ */ -+MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { -+ return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); -+} -+ - /* - * Internal function. Do not use directly. - * Reserves the given number of bytes within the aligned/buffer segment of the wksp, -@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase - { - assert(phase >= ws->phase); - if (phase > ws->phase) { -- /* Going from allocating objects to allocating buffers */ -- if (ws->phase < ZSTD_cwksp_alloc_buffers && -- phase >= ZSTD_cwksp_alloc_buffers) { -+ /* Going from allocating objects to allocating initOnce / tables */ -+ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && -+ phase >= ZSTD_cwksp_alloc_aligned_init_once) { - ws->tableValidEnd = ws->objectEnd; -- } -+ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); - -- /* Going from allocating buffers to allocating aligneds/tables */ -- if (ws->phase < ZSTD_cwksp_alloc_aligned && -- phase >= ZSTD_cwksp_alloc_aligned) { -- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. 
*/ -- size_t const bytesToAlign = -- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); -- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); -- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ -- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), -- memory_allocation, "aligned phase - alignment initial allocation failed!"); -- } - { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ -- void* const alloc = ws->objectEnd; -+ void *const alloc = ws->objectEnd; - size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); -- void* const objectEnd = (BYTE*)alloc + bytesToAlign; -+ void *const objectEnd = (BYTE *) alloc + bytesToAlign; - DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); - RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, - "table phase - alignment initial allocation failed!"); -@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase - ws->tableEnd = objectEnd; /* table area starts being empty */ - if (ws->tableValidEnd < ws->tableEnd) { - ws->tableValidEnd = ws->tableEnd; -- } } } -+ } -+ } -+ } - ws->phase = phase; - ZSTD_cwksp_assert_internal_consistency(ws); - } -@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase - */ - MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) - { -- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); -+ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); - } - - /* -@@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) - return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); - } - -+/* -+ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). -+ * This memory has been initialized at least once in the past. -+ * This doesn't mean it has been initialized this time, and it might contain data from previous -+ * operations. -+ * The main usage is for algorithms that might need read access into uninitialized memory. -+ * The algorithm must maintain safety under these conditions and must make sure it doesn't -+ * leak any of the past data (directly or in side channels). -+ */ -+MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) -+{ -+ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); -+ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); -+ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); -+ if(ptr && ptr < ws->initOnceStart) { -+ /* We assume the memory following the current allocation is either: -+ * 1. Not usable as initOnce memory (end of workspace) -+ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) -+ * 3. An ASAN redzone, in which case we don't want to write on it -+ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. -+ * Note that we assume here that MSAN and ASAN cannot run in the same time. 
*/ -+ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); -+ ws->initOnceStart = ptr; -+ } -+ return ptr; -+} -+ - /* - * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). - */ -@@ -356,18 +396,22 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) - - /* - * Aligned on 64 bytes. These buffers have the special property that -- * their values remain constrained, allowing us to re-use them without -+ * their values remain constrained, allowing us to reuse them without - * memset()-ing them. - */ - MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) - { -- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; -+ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; - void* alloc; - void* end; - void* top; - -- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { -- return NULL; -+ /* We can only start allocating tables after we are done reserving space for objects at the -+ * start of the workspace */ -+ if(ws->phase < phase) { -+ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { -+ return NULL; -+ } - } - alloc = ws->tableEnd; - end = (BYTE *)alloc + bytes; -@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { - assert(ws->tableValidEnd >= ws->objectEnd); - assert(ws->tableValidEnd <= ws->allocStart); - if (ws->tableValidEnd < ws->tableEnd) { -- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); -+ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); - } - ZSTD_cwksp_mark_tables_clean(ws); - } -@@ -478,14 +522,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { - - - ws->tableEnd = ws->objectEnd; -- ws->allocStart = ws->workspaceEnd; -+ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); - ws->allocFailed = 0; -- if (ws->phase > ZSTD_cwksp_alloc_buffers) { -- ws->phase = ZSTD_cwksp_alloc_buffers; -+ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { -+ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; - } - ZSTD_cwksp_assert_internal_consistency(ws); - } - -+MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { -+ return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); -+} -+ -+MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { -+ return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) -+ + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); -+} -+ - /* - * The provided workspace takes ownership of the buffer [start, start+size). 
- * Any existing values in the workspace are ignored (the previously managed -@@ -498,6 +551,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c - ws->workspaceEnd = (BYTE*)start + size; - ws->objectEnd = ws->workspace; - ws->tableValidEnd = ws->objectEnd; -+ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); - ws->phase = ZSTD_cwksp_alloc_objects; - ws->isStatic = isStatic; - ZSTD_cwksp_clear(ws); -@@ -529,15 +583,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) { - ZSTD_memset(src, 0, sizeof(ZSTD_cwksp)); - } - --MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { -- return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); --} -- --MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { -- return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) -- + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); --} -- - MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { - return ws->allocFailed; - } -@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { - * Returns if the estimated space needed for a wksp is within an acceptable limit of the - * actual amount of space used. - */ --MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, -- size_t const estimatedSpace, int resizedWorkspace) { -- if (resizedWorkspace) { -- /* Resized/newly allocated wksp should have exact bounds */ -- return ZSTD_cwksp_used(ws) == estimatedSpace; -- } else { -- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes -- * than estimatedSpace. See the comments in zstd_cwksp.h for details. -- */ -- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); -- } -+MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { -+ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice -+ * the alignment bytes difference between estimation and actual usage */ -+ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && -+ ZSTD_cwksp_used(ws) <= estimatedSpace; - } - - -diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c -index 76933dea2624..5ff54f17d92f 100644 ---- a/lib/zstd/compress/zstd_double_fast.c -+++ b/lib/zstd/compress/zstd_double_fast.c -@@ -1,5 +1,6 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -11,8 +12,49 @@ - #include "zstd_compress_internal.h" - #include "zstd_double_fast.h" - -+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR - --void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, -+ void const* end, ZSTD_dictTableLoadMethod_e dtlm) -+{ -+ const ZSTD_compressionParameters* const cParams = &ms->cParams; -+ U32* const hashLarge = ms->hashTable; -+ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; -+ U32 const mls = cParams->minMatch; -+ U32* const hashSmall = ms->chainTable; -+ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; -+ const BYTE* const base = ms->window.base; -+ const BYTE* ip = base + ms->nextToUpdate; -+ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; -+ const U32 fastHashFillStep = 3; -+ -+ /* Always insert every fastHashFillStep position into the hash tables. -+ * Insert the other positions into the large hash table if their entry -+ * is empty. -+ */ -+ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { -+ U32 const curr = (U32)(ip - base); -+ U32 i; -+ for (i = 0; i < fastHashFillStep; ++i) { -+ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); -+ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); -+ if (i == 0) { -+ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); -+ } -+ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { -+ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); -+ } -+ /* Only load extra positions for ZSTD_dtlm_full */ -+ if (dtlm == ZSTD_dtlm_fast) -+ break; -+ } } -+} -+ -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, - void const* end, ZSTD_dictTableLoadMethod_e dtlm) - { - const ZSTD_compressionParameters* const cParams = &ms->cParams; -@@ -43,11 +85,24 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, - /* Only load extra positions for ZSTD_dtlm_full */ - if (dtlm == ZSTD_dtlm_fast) - break; -- } } -+ } } -+} -+ -+void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, -+ const void* const end, -+ ZSTD_dictTableLoadMethod_e dtlm, -+ ZSTD_tableFillPurpose_e tfp) -+{ -+ if (tfp == ZSTD_tfp_forCDict) { -+ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); -+ } else { -+ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); -+ } - } - - - FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR - size_t ZSTD_compressBlock_doubleFast_noDict_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, U32 const mls /* template */) -@@ -67,7 +122,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( - const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - HASH_READ_SIZE; - U32 offset_1=rep[0], offset_2=rep[1]; -- U32 offsetSaved = 0; -+ U32 offsetSaved1 = 0, offsetSaved2 = 0; - - size_t mLength; - U32 offset; -@@ -100,8 +155,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( - U32 const current = (U32)(ip - base); - U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); - U32 const maxRep = current - windowLow; -- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; -- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; -+ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; -+ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; - } - - 
/* Outer Loop: one iteration per match found and stored */ -@@ -131,7 +186,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( - if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { - mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; - ip++; -- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); -+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); - goto _match_stored; - } - -@@ -175,9 +230,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( - } while (ip1 <= ilimit); - - _cleanup: -+ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), -+ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ -+ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; -+ - /* save reps for next block */ -- rep[0] = offset_1 ? offset_1 : offsetSaved; -- rep[1] = offset_2 ? offset_2 : offsetSaved; -+ rep[0] = offset_1 ? offset_1 : offsetSaved1; -+ rep[1] = offset_2 ? offset_2 : offsetSaved2; - - /* Return the last literals size */ - return (size_t)(iend - anchor); -@@ -217,7 +276,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( - hashLong[hl1] = (U32)(ip1 - base); - } - -- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); -+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); - - _match_stored: - /* match found */ -@@ -243,7 +302,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( - U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); -- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); -+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); - ip += rLength; - anchor = ip; - continue; /* faster when present ... (?) 
*/ -@@ -254,6 +313,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( - - - FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR - size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, -@@ -275,7 +335,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( - const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - HASH_READ_SIZE; - U32 offset_1=rep[0], offset_2=rep[1]; -- U32 offsetSaved = 0; - - const ZSTD_matchState_t* const dms = ms->dictMatchState; - const ZSTD_compressionParameters* const dictCParams = &dms->cParams; -@@ -286,8 +345,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( - const BYTE* const dictStart = dictBase + dictStartIndex; - const BYTE* const dictEnd = dms->window.nextSrc; - const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); -- const U32 dictHBitsL = dictCParams->hashLog; -- const U32 dictHBitsS = dictCParams->chainLog; -+ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; -+ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; - const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); - - DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); -@@ -295,6 +354,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( - /* if a dictionary is attached, it must be within window range */ - assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); - -+ if (ms->prefetchCDictTables) { -+ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); -+ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); -+ PREFETCH_AREA(dictHashLong, hashTableBytes); -+ PREFETCH_AREA(dictHashSmall, chainTableBytes); -+ } -+ - /* init */ - ip += (dictAndPrefixLength == 0); - -@@ -309,8 +375,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( - U32 offset; - size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); - size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); -- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); -- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); -+ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); -+ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); -+ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; -+ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; -+ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); -+ int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); - U32 const curr = (U32)(ip-base); - U32 const matchIndexL = hashLong[h2]; - U32 matchIndexS = hashSmall[h]; -@@ -328,7 +398,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( - const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; - mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; - ip++; -- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); -+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); - goto _match_stored; - } - -@@ -340,9 +410,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( - while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ - goto _match_found; - } -- } else { -+ } else if (dictTagsMatchL) { - /* check dictMatchState long match */ -- U32 const dictMatchIndexL = dictHashLong[dictHL]; -+ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; - const BYTE* dictMatchL = dictBase + dictMatchIndexL; - assert(dictMatchL < dictEnd); - -@@ -358,9 +428,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( - if (MEM_read32(match) == MEM_read32(ip)) { - goto _search_next_long; - } -- } else { -+ } else if (dictTagsMatchS) { - /* check dictMatchState short match */ -- U32 const dictMatchIndexS = dictHashSmall[dictHS]; -+ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; - match = dictBase + dictMatchIndexS; - matchIndexS = dictMatchIndexS + dictIndexDelta; - -@@ -375,10 +445,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( - continue; - - _search_next_long: -- - { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); -- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); -+ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); - U32 const matchIndexL3 = hashLong[hl3]; -+ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; -+ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); - const BYTE* matchL3 = base + matchIndexL3; - hashLong[hl3] = curr + 1; - -@@ -391,9 +462,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( - while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ - goto _match_found; - } -- } else { -+ } else if (dictTagsMatchL3) { - /* check dict long +1 match */ -- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; -+ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; - const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; - assert(dictMatchL3 < dictEnd); - if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { -@@ -419,7 +490,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( - offset_2 = offset_1; - offset_1 = offset; - -- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); -+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); - - _match_stored: - /* match found */ -@@ -448,7 +519,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( - const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; - U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ -- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); -+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; - ip += repLength2; -@@ -461,8 +532,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( - } /* while (ip < ilimit) */ - - /* save reps for next block */ -- rep[0] = offset_1 ? offset_1 : offsetSaved; -- rep[1] = offset_2 ? offset_2 : offsetSaved; -+ rep[0] = offset_1; -+ rep[1] = offset_2; - - /* Return the last literals size */ - return (size_t)(iend - anchor); -@@ -527,7 +598,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( - } - - --static size_t ZSTD_compressBlock_doubleFast_extDict_generic( -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+size_t ZSTD_compressBlock_doubleFast_extDict_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, - U32 const mls /* template */) -@@ -585,7 +658,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( - const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; - mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; - ip++; -- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); -+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); - } else { - if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { - const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; -@@ -596,7 +669,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( - while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ - offset_2 = offset_1; - offset_1 = offset; -- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); -+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); - - } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { - size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); -@@ -621,7 +694,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( - } - offset_2 = offset_1; - offset_1 = offset; -- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); -+ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); - - } else { - ip += ((ip-anchor) >> kSearchStrength) + 1; -@@ -653,7 +726,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( - const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; - size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; - U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ -- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); -+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; - ip += repLength2; -@@ -694,3 +767,5 @@ size_t ZSTD_compressBlock_doubleFast_extDict( - return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); - } - } -+ -+#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ -diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h -index 6822bde65a1d..b7ddc714f13e 100644 ---- a/lib/zstd/compress/zstd_double_fast.h -+++ b/lib/zstd/compress/zstd_double_fast.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -15,8 +16,12 @@ - #include "../common/mem.h" /* U32 */ - #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ - -+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR -+ - void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, -- void const* end, ZSTD_dictTableLoadMethod_e dtlm); -+ void const* end, ZSTD_dictTableLoadMethod_e dtlm, -+ ZSTD_tableFillPurpose_e tfp); -+ - size_t ZSTD_compressBlock_doubleFast( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); -@@ -27,6 +32,14 @@ size_t ZSTD_compressBlock_doubleFast_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); - -+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast -+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState -+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict -+#else -+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL -+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL -+#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL -+#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ - - - #endif /* ZSTD_DOUBLE_FAST_H */ -diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c -index a752e6beab52..b7a63ba4ce56 100644 ---- a/lib/zstd/compress/zstd_fast.c -+++ b/lib/zstd/compress/zstd_fast.c -@@ -1,5 +1,6 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -11,8 +12,46 @@ - #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ - #include "zstd_fast.h" - -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, -+ const void* const end, -+ ZSTD_dictTableLoadMethod_e dtlm) -+{ -+ const ZSTD_compressionParameters* const cParams = &ms->cParams; -+ U32* const hashTable = ms->hashTable; -+ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; -+ U32 const mls = cParams->minMatch; -+ const BYTE* const base = ms->window.base; -+ const BYTE* ip = base + ms->nextToUpdate; -+ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; -+ const U32 fastHashFillStep = 3; - --void ZSTD_fillHashTable(ZSTD_matchState_t* ms, -+ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. -+ * Feel free to remove this assert if there's a good reason! */ -+ assert(dtlm == ZSTD_dtlm_full); -+ -+ /* Always insert every fastHashFillStep position into the hash table. -+ * Insert the other positions if their hash entry is empty. -+ */ -+ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { -+ U32 const curr = (U32)(ip - base); -+ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); -+ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } -+ -+ if (dtlm == ZSTD_dtlm_fast) continue; -+ /* Only load extra positions for ZSTD_dtlm_full */ -+ { U32 p; -+ for (p = 1; p < fastHashFillStep; ++p) { -+ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); -+ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ -+ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); -+ } } } } -+} -+ -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, - const void* const end, - ZSTD_dictTableLoadMethod_e dtlm) - { -@@ -25,6 +64,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, - const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; - const U32 fastHashFillStep = 3; - -+ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. -+ * Feel free to remove this assert if there's a good reason! */ -+ assert(dtlm == ZSTD_dtlm_fast); -+ - /* Always insert every fastHashFillStep position into the hash table. - * Insert the other positions if their hash entry is empty. - */ -@@ -42,6 +85,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, - } } } } - } - -+void ZSTD_fillHashTable(ZSTD_matchState_t* ms, -+ const void* const end, -+ ZSTD_dictTableLoadMethod_e dtlm, -+ ZSTD_tableFillPurpose_e tfp) -+{ -+ if (tfp == ZSTD_tfp_forCDict) { -+ ZSTD_fillHashTableForCDict(ms, end, dtlm); -+ } else { -+ ZSTD_fillHashTableForCCtx(ms, end, dtlm); -+ } -+} -+ - - /* - * If you squint hard enough (and ignore repcodes), the search operation at any -@@ -89,8 +144,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, - * - * This is also the work we do at the beginning to enter the loop initially. 
- */ --FORCE_INLINE_TEMPLATE size_t --ZSTD_compressBlock_fast_noDict_generic( -+FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+size_t ZSTD_compressBlock_fast_noDict_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, - U32 const mls, U32 const hasStep) -@@ -117,7 +173,7 @@ ZSTD_compressBlock_fast_noDict_generic( - - U32 rep_offset1 = rep[0]; - U32 rep_offset2 = rep[1]; -- U32 offsetSaved = 0; -+ U32 offsetSaved1 = 0, offsetSaved2 = 0; - - size_t hash0; /* hash for ip0 */ - size_t hash1; /* hash for ip1 */ -@@ -141,8 +197,8 @@ ZSTD_compressBlock_fast_noDict_generic( - { U32 const curr = (U32)(ip0 - base); - U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); - U32 const maxRep = curr - windowLow; -- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; -- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; -+ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; -+ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; - } - - /* start each op */ -@@ -180,8 +236,14 @@ ZSTD_compressBlock_fast_noDict_generic( - mLength = ip0[-1] == match0[-1]; - ip0 -= mLength; - match0 -= mLength; -- offcode = STORE_REPCODE_1; -+ offcode = REPCODE1_TO_OFFBASE; - mLength += 4; -+ -+ /* First write next hash table entry; we've already calculated it. -+ * This write is known to be safe because the ip1 is before the -+ * repcode (ip2). */ -+ hashTable[hash1] = (U32)(ip1 - base); -+ - goto _match; - } - -@@ -195,6 +257,12 @@ ZSTD_compressBlock_fast_noDict_generic( - /* check match at ip[0] */ - if (MEM_read32(ip0) == mval) { - /* found a match! */ -+ -+ /* First write next hash table entry; we've already calculated it. -+ * This write is known to be safe because the ip1 == ip0 + 1, so -+ * we know we will resume searching after ip1 */ -+ hashTable[hash1] = (U32)(ip1 - base); -+ - goto _offset; - } - -@@ -224,6 +292,21 @@ ZSTD_compressBlock_fast_noDict_generic( - /* check match at ip[0] */ - if (MEM_read32(ip0) == mval) { - /* found a match! */ -+ -+ /* first write next hash table entry; we've already calculated it */ -+ if (step <= 4) { -+ /* We need to avoid writing an index into the hash table >= the -+ * position at which we will pick up our searching after we've -+ * taken this match. -+ * -+ * The minimum possible match has length 4, so the earliest ip0 -+ * can be after we take this match will be the current ip0 + 4. -+ * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely -+ * write this position. -+ */ -+ hashTable[hash1] = (U32)(ip1 - base); -+ } -+ - goto _offset; - } - -@@ -254,9 +337,24 @@ ZSTD_compressBlock_fast_noDict_generic( - * However, it seems to be a meaningful performance hit to try to search - * them. So let's not. */ - -+ /* When the repcodes are outside of the prefix, we set them to zero before the loop. -+ * When the offsets are still zero, we need to restore them after the block to have a correct -+ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both -+ * offsets were invalid. We need to figure out which offset to refill with. -+ * - If both offsets are zero they are in the same order. -+ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. -+ * - If only one is zero, we need to decide which offset to restore. -+ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. -+ * - It is impossible for rep_offset2 to be non-zero. 
-+ * -+ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then -+ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. -+ */ -+ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; -+ - /* save reps for next block */ -- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; -- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; -+ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; -+ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; - - /* Return the last literals size */ - return (size_t)(iend - anchor); -@@ -267,7 +365,7 @@ ZSTD_compressBlock_fast_noDict_generic( - match0 = base + idx; - rep_offset2 = rep_offset1; - rep_offset1 = (U32)(ip0-match0); -- offcode = STORE_OFFSET(rep_offset1); -+ offcode = OFFSET_TO_OFFBASE(rep_offset1); - mLength = 4; - - /* Count the backwards match length. */ -@@ -287,11 +385,6 @@ ZSTD_compressBlock_fast_noDict_generic( - ip0 += mLength; - anchor = ip0; - -- /* write next hash table entry */ -- if (ip1 < ip0) { -- hashTable[hash1] = (U32)(ip1 - base); -- } -- - /* Fill table and check for immediate repcode. */ - if (ip0 <= ilimit) { - /* Fill Table */ -@@ -306,7 +399,7 @@ ZSTD_compressBlock_fast_noDict_generic( - { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ - hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); - ip0 += rLength; -- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); -+ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); - anchor = ip0; - continue; /* faster when present (confirmed on gcc-8) ... (?) */ - } } } -@@ -369,6 +462,7 @@ size_t ZSTD_compressBlock_fast( - } - - FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR - size_t ZSTD_compressBlock_fast_dictMatchState_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, U32 const mls, U32 const hasStep) -@@ -380,14 +474,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( - U32 const stepSize = cParams->targetLength + !(cParams->targetLength); - const BYTE* const base = ms->window.base; - const BYTE* const istart = (const BYTE*)src; -- const BYTE* ip = istart; -+ const BYTE* ip0 = istart; -+ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ - const BYTE* anchor = istart; - const U32 prefixStartIndex = ms->window.dictLimit; - const BYTE* const prefixStart = base + prefixStartIndex; - const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - HASH_READ_SIZE; - U32 offset_1=rep[0], offset_2=rep[1]; -- U32 offsetSaved = 0; - - const ZSTD_matchState_t* const dms = ms->dictMatchState; - const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; -@@ -397,13 +491,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( - const BYTE* const dictStart = dictBase + dictStartIndex; - const BYTE* const dictEnd = dms->window.nextSrc; - const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); -- const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); -- const U32 dictHLog = dictCParams->hashLog; -+ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); -+ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; - - /* if a dictionary is still attached, it necessarily means that - * it is within window size. So we just check it. 
*/ - const U32 maxDistance = 1U << cParams->windowLog; -- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); -+ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); - assert(endIndex - prefixStartIndex <= maxDistance); - (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ - -@@ -413,106 +507,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( - * when translating a dict index into a local index */ - assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); - -+ if (ms->prefetchCDictTables) { -+ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); -+ PREFETCH_AREA(dictHashTable, hashTableBytes); -+ } -+ - /* init */ - DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); -- ip += (dictAndPrefixLength == 0); -+ ip0 += (dictAndPrefixLength == 0); - /* dictMatchState repCode checks don't currently handle repCode == 0 - * disabling. */ - assert(offset_1 <= dictAndPrefixLength); - assert(offset_2 <= dictAndPrefixLength); - -- /* Main Search Loop */ -- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ -+ /* Outer search loop */ -+ assert(stepSize >= 1); -+ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ - size_t mLength; -- size_t const h = ZSTD_hashPtr(ip, hlog, mls); -- U32 const curr = (U32)(ip-base); -- U32 const matchIndex = hashTable[h]; -- const BYTE* match = base + matchIndex; -- const U32 repIndex = curr + 1 - offset_1; -- const BYTE* repMatch = (repIndex < prefixStartIndex) ? -- dictBase + (repIndex - dictIndexDelta) : -- base + repIndex; -- hashTable[h] = curr; /* update hash table */ -- -- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ -- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { -- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; -- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; -- ip++; -- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); -- } else if ( (matchIndex <= prefixStartIndex) ) { -- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); -- U32 const dictMatchIndex = dictHashTable[dictHash]; -- const BYTE* dictMatch = dictBase + dictMatchIndex; -- if (dictMatchIndex <= dictStartIndex || -- MEM_read32(dictMatch) != MEM_read32(ip)) { -- assert(stepSize >= 1); -- ip += ((ip-anchor) >> kSearchStrength) + stepSize; -- continue; -- } else { -- /* found a dict match */ -- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); -- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; -- while (((ip>anchor) & (dictMatch>dictStart)) -- && (ip[-1] == dictMatch[-1])) { -- ip--; dictMatch--; mLength++; -+ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); -+ -+ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); -+ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; -+ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); -+ -+ U32 matchIndex = hashTable[hash0]; -+ U32 curr = (U32)(ip0 - base); -+ size_t step = stepSize; -+ const size_t kStepIncr = 1 << kSearchStrength; -+ const BYTE* nextStep = ip0 + kStepIncr; -+ -+ /* Inner search loop */ -+ while (1) { -+ const BYTE* match = base + matchIndex; -+ const U32 repIndex = curr + 1 - offset_1; -+ const BYTE* repMatch = (repIndex < prefixStartIndex) ? 
-+ dictBase + (repIndex - dictIndexDelta) : -+ base + repIndex; -+ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); -+ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); -+ hashTable[hash0] = curr; /* update hash table */ -+ -+ if (((U32) ((prefixStartIndex - 1) - repIndex) >= -+ 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ -+ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { -+ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; -+ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; -+ ip0++; -+ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); -+ break; -+ } -+ -+ if (dictTagsMatch) { -+ /* Found a possible dict match */ -+ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; -+ const BYTE* dictMatch = dictBase + dictMatchIndex; -+ if (dictMatchIndex > dictStartIndex && -+ MEM_read32(dictMatch) == MEM_read32(ip0)) { -+ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ -+ if (matchIndex <= prefixStartIndex) { -+ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); -+ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; -+ while (((ip0 > anchor) & (dictMatch > dictStart)) -+ && (ip0[-1] == dictMatch[-1])) { -+ ip0--; -+ dictMatch--; -+ mLength++; -+ } /* catch up */ -+ offset_2 = offset_1; -+ offset_1 = offset; -+ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); -+ break; -+ } -+ } -+ } -+ -+ if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) { -+ /* found a regular match */ -+ U32 const offset = (U32) (ip0 - match); -+ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; -+ while (((ip0 > anchor) & (match > prefixStart)) -+ && (ip0[-1] == match[-1])) { -+ ip0--; -+ match--; -+ mLength++; - } /* catch up */ - offset_2 = offset_1; - offset_1 = offset; -- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); -+ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); -+ break; - } -- } else if (MEM_read32(match) != MEM_read32(ip)) { -- /* it's not a match, and we're not going to check the dictionary */ -- assert(stepSize >= 1); -- ip += ((ip-anchor) >> kSearchStrength) + stepSize; -- continue; -- } else { -- /* found a regular match */ -- U32 const offset = (U32)(ip-match); -- mLength = ZSTD_count(ip+4, match+4, iend) + 4; -- while (((ip>anchor) & (match>prefixStart)) -- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ -- offset_2 = offset_1; -- offset_1 = offset; -- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); -- } -+ -+ /* Prepare for next iteration */ -+ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; -+ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); -+ matchIndex = hashTable[hash1]; -+ -+ if (ip1 >= nextStep) { -+ step++; -+ nextStep += kStepIncr; -+ } -+ ip0 = ip1; -+ ip1 = ip1 + step; -+ if (ip1 > ilimit) goto _cleanup; -+ -+ curr = (U32)(ip0 - base); -+ hash0 = hash1; -+ } /* end inner search loop */ - - /* match found */ -- ip += mLength; -- anchor = ip; -+ assert(mLength); -+ ip0 += mLength; -+ anchor = ip0; - -- if (ip <= ilimit) { -+ if (ip0 <= ilimit) { - /* Fill Table */ - assert(base+curr+2 > istart); 
/* check base overflow */ - hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ -- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); -+ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); - - /* check immediate repcode */ -- while (ip <= ilimit) { -- U32 const current2 = (U32)(ip-base); -+ while (ip0 <= ilimit) { -+ U32 const current2 = (U32)(ip0-base); - U32 const repIndex2 = current2 - offset_2; - const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? - dictBase - dictIndexDelta + repIndex2 : - base + repIndex2; - if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) -- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { -+ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { - const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; -- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; -+ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; - U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ -- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); -- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; -- ip += repLength2; -- anchor = ip; -+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); -+ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; -+ ip0 += repLength2; -+ anchor = ip0; - continue; - } - break; - } - } -+ -+ /* Prepare for next iteration */ -+ assert(ip0 == anchor); -+ ip1 = ip0 + stepSize; - } - -+_cleanup: - /* save reps for next block */ -- rep[0] = offset_1 ? offset_1 : offsetSaved; -- rep[1] = offset_2 ? offset_2 : offsetSaved; -+ rep[0] = offset_1; -+ rep[1] = offset_2; - - /* Return the last literals size */ - return (size_t)(iend - anchor); -@@ -545,7 +688,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState( - } - - --static size_t ZSTD_compressBlock_fast_extDict_generic( -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+size_t ZSTD_compressBlock_fast_extDict_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, U32 const mls, U32 const hasStep) - { -@@ -553,11 +698,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( - U32* const hashTable = ms->hashTable; - U32 const hlog = cParams->hashLog; - /* support stepSize of 0 */ -- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); -+ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; - const BYTE* const base = ms->window.base; - const BYTE* const dictBase = ms->window.dictBase; - const BYTE* const istart = (const BYTE*)src; -- const BYTE* ip = istart; - const BYTE* anchor = istart; - const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); - const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); -@@ -570,6 +714,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( - const BYTE* const iend = istart + srcSize; - const BYTE* const ilimit = iend - 8; - U32 offset_1=rep[0], offset_2=rep[1]; -+ U32 offsetSaved1 = 0, offsetSaved2 = 0; -+ -+ const BYTE* ip0 = istart; -+ const BYTE* ip1; -+ const BYTE* ip2; -+ const BYTE* ip3; -+ U32 current0; -+ -+ -+ size_t hash0; /* hash for ip0 */ -+ size_t hash1; /* hash for ip1 */ -+ U32 idx; /* match idx for ip0 */ -+ const BYTE* idxBase; /* base pointer for idx */ -+ -+ U32 offcode; -+ const BYTE* match0; -+ size_t mLength; -+ const BYTE* matchEnd 
= 0; /* initialize to avoid warning, assert != 0 later */ -+ -+ size_t step; -+ const BYTE* nextStep; -+ const size_t kStepIncr = (1 << (kSearchStrength - 1)); - - (void)hasStep; /* not currently specialized on whether it's accelerated */ - -@@ -579,75 +745,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( - if (prefixStartIndex == dictStartIndex) - return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); - -- /* Search Loop */ -- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ -- const size_t h = ZSTD_hashPtr(ip, hlog, mls); -- const U32 matchIndex = hashTable[h]; -- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; -- const BYTE* match = matchBase + matchIndex; -- const U32 curr = (U32)(ip-base); -- const U32 repIndex = curr + 1 - offset_1; -- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; -- const BYTE* const repMatch = repBase + repIndex; -- hashTable[h] = curr; /* update hash table */ -- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); -- -- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ -- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ -- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { -- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; -- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; -- ip++; -- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); -- ip += rLength; -- anchor = ip; -- } else { -- if ( (matchIndex < dictStartIndex) || -- (MEM_read32(match) != MEM_read32(ip)) ) { -- assert(stepSize >= 1); -- ip += ((ip-anchor) >> kSearchStrength) + stepSize; -- continue; -+ { U32 const curr = (U32)(ip0 - base); -+ U32 const maxRep = curr - dictStartIndex; -+ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; -+ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; -+ } -+ -+ /* start each op */ -+_start: /* Requires: ip0 */ -+ -+ step = stepSize; -+ nextStep = ip0 + kStepIncr; -+ -+ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ -+ ip1 = ip0 + 1; -+ ip2 = ip0 + step; -+ ip3 = ip2 + 1; -+ -+ if (ip3 >= ilimit) { -+ goto _cleanup; -+ } -+ -+ hash0 = ZSTD_hashPtr(ip0, hlog, mls); -+ hash1 = ZSTD_hashPtr(ip1, hlog, mls); -+ -+ idx = hashTable[hash0]; -+ idxBase = idx < prefixStartIndex ? dictBase : base; -+ -+ do { -+ { /* load repcode match for ip[2] */ -+ U32 const current2 = (U32)(ip2 - base); -+ U32 const repIndex = current2 - offset_1; -+ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; -+ U32 rval; -+ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ -+ & (offset_1 > 0) ) { -+ rval = MEM_read32(repBase + repIndex); -+ } else { -+ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ - } -- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; -- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? 
dictStart : prefixStart; -- U32 const offset = curr - matchIndex; -- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; -- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ -- offset_2 = offset_1; offset_1 = offset; /* update offset history */ -- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); -- ip += mLength; -- anchor = ip; -+ -+ /* write back hash table entry */ -+ current0 = (U32)(ip0 - base); -+ hashTable[hash0] = current0; -+ -+ /* check repcode at ip[2] */ -+ if (MEM_read32(ip2) == rval) { -+ ip0 = ip2; -+ match0 = repBase + repIndex; -+ matchEnd = repIndex < prefixStartIndex ? dictEnd : iend; -+ assert((match0 != prefixStart) & (match0 != dictStart)); -+ mLength = ip0[-1] == match0[-1]; -+ ip0 -= mLength; -+ match0 -= mLength; -+ offcode = REPCODE1_TO_OFFBASE; -+ mLength += 4; -+ goto _match; - } } - -- if (ip <= ilimit) { -- /* Fill Table */ -- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; -- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); -- /* check immediate repcode */ -- while (ip <= ilimit) { -- U32 const current2 = (U32)(ip-base); -- U32 const repIndex2 = current2 - offset_2; -- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; -- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ -- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { -- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; -- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; -- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ -- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); -- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; -- ip += repLength2; -- anchor = ip; -- continue; -- } -- break; -- } } } -+ { /* load match for ip[0] */ -+ U32 const mval = idx >= dictStartIndex ? -+ MEM_read32(idxBase + idx) : -+ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ -+ -+ /* check match at ip[0] */ -+ if (MEM_read32(ip0) == mval) { -+ /* found a match! */ -+ goto _offset; -+ } } -+ -+ /* lookup ip[1] */ -+ idx = hashTable[hash1]; -+ idxBase = idx < prefixStartIndex ? dictBase : base; -+ -+ /* hash ip[2] */ -+ hash0 = hash1; -+ hash1 = ZSTD_hashPtr(ip2, hlog, mls); -+ -+ /* advance to next positions */ -+ ip0 = ip1; -+ ip1 = ip2; -+ ip2 = ip3; -+ -+ /* write back hash table entry */ -+ current0 = (U32)(ip0 - base); -+ hashTable[hash0] = current0; -+ -+ { /* load match for ip[0] */ -+ U32 const mval = idx >= dictStartIndex ? -+ MEM_read32(idxBase + idx) : -+ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ -+ -+ /* check match at ip[0] */ -+ if (MEM_read32(ip0) == mval) { -+ /* found a match! */ -+ goto _offset; -+ } } -+ -+ /* lookup ip[1] */ -+ idx = hashTable[hash1]; -+ idxBase = idx < prefixStartIndex ? 
dictBase : base; -+ -+ /* hash ip[2] */ -+ hash0 = hash1; -+ hash1 = ZSTD_hashPtr(ip2, hlog, mls); -+ -+ /* advance to next positions */ -+ ip0 = ip1; -+ ip1 = ip2; -+ ip2 = ip0 + step; -+ ip3 = ip1 + step; -+ -+ /* calculate step */ -+ if (ip2 >= nextStep) { -+ step++; -+ PREFETCH_L1(ip1 + 64); -+ PREFETCH_L1(ip1 + 128); -+ nextStep += kStepIncr; -+ } -+ } while (ip3 < ilimit); -+ -+_cleanup: -+ /* Note that there are probably still a couple positions we could search. -+ * However, it seems to be a meaningful performance hit to try to search -+ * them. So let's not. */ -+ -+ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), -+ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ -+ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; - - /* save reps for next block */ -- rep[0] = offset_1; -- rep[1] = offset_2; -+ rep[0] = offset_1 ? offset_1 : offsetSaved1; -+ rep[1] = offset_2 ? offset_2 : offsetSaved2; - - /* Return the last literals size */ - return (size_t)(iend - anchor); -+ -+_offset: /* Requires: ip0, idx, idxBase */ -+ -+ /* Compute the offset code. */ -+ { U32 const offset = current0 - idx; -+ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; -+ matchEnd = idx < prefixStartIndex ? dictEnd : iend; -+ match0 = idxBase + idx; -+ offset_2 = offset_1; -+ offset_1 = offset; -+ offcode = OFFSET_TO_OFFBASE(offset); -+ mLength = 4; -+ -+ /* Count the backwards match length. */ -+ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { -+ ip0--; -+ match0--; -+ mLength++; -+ } } -+ -+_match: /* Requires: ip0, match0, offcode, matchEnd */ -+ -+ /* Count the forward length. */ -+ assert(matchEnd != 0); -+ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); -+ -+ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); -+ -+ ip0 += mLength; -+ anchor = ip0; -+ -+ /* write next hash table entry */ -+ if (ip1 < ip0) { -+ hashTable[hash1] = (U32)(ip1 - base); -+ } -+ -+ /* Fill table and check for immediate repcode. */ -+ if (ip0 <= ilimit) { -+ /* Fill Table */ -+ assert(base+current0+2 > istart); /* check base overflow */ -+ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ -+ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); -+ -+ while (ip0 <= ilimit) { -+ U32 const repIndex2 = (U32)(ip0-base) - offset_2; -+ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; -+ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */ -+ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { -+ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; -+ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; -+ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ -+ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); -+ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); -+ ip0 += repLength2; -+ anchor = ip0; -+ continue; -+ } -+ break; -+ } } -+ -+ goto _start; - } - - ZSTD_GEN_FAST_FN(extDict, 4, 0) -@@ -660,6 +953,7 @@ size_t ZSTD_compressBlock_fast_extDict( - void const* src, size_t srcSize) - { - U32 const mls = ms->cParams.minMatch; -+ assert(ms->dictMatchState == NULL); - switch(mls) - { - default: /* includes case 3 */ -diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h -index fddc2f532d21..e64d9e1b2d39 100644 ---- a/lib/zstd/compress/zstd_fast.h -+++ b/lib/zstd/compress/zstd_fast.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -16,7 +17,8 @@ - #include "zstd_compress_internal.h" - - void ZSTD_fillHashTable(ZSTD_matchState_t* ms, -- void const* end, ZSTD_dictTableLoadMethod_e dtlm); -+ void const* end, ZSTD_dictTableLoadMethod_e dtlm, -+ ZSTD_tableFillPurpose_e tfp); - size_t ZSTD_compressBlock_fast( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); -diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c -index 0298a01a7504..3e88d8a1a136 100644 ---- a/lib/zstd/compress/zstd_lazy.c -+++ b/lib/zstd/compress/zstd_lazy.c -@@ -1,5 +1,6 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -10,14 +11,23 @@ - - #include "zstd_compress_internal.h" - #include "zstd_lazy.h" -+#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ -+ -+#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) -+ -+#define kLazySkippingStep 8 - - - /*-************************************* - * Binary Tree search - ***************************************/ - --static void --ZSTD_updateDUBT(ZSTD_matchState_t* ms, -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_updateDUBT(ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* iend, - U32 mls) - { -@@ -60,8 +70,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, - * sort one already inserted but unsorted position - * assumption : curr >= btlow == (curr - btmask) - * doesn't fail */ --static void --ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, - U32 curr, const BYTE* inputEnd, - U32 nbCompares, U32 btLow, - const ZSTD_dictMode_e dictMode) -@@ -149,8 +160,9 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, - } - - --static size_t --ZSTD_DUBT_findBetterDictMatch ( -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+size_t ZSTD_DUBT_findBetterDictMatch ( - const ZSTD_matchState_t* ms, - const BYTE* const ip, const BYTE* const iend, - size_t* offsetPtr, -@@ -197,8 +209,8 @@ ZSTD_DUBT_findBetterDictMatch ( - U32 matchIndex = dictMatchIndex + dictIndexDelta; - if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { - DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", -- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); -- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); -+ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); -+ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); - } - if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ - break; /* drop, to guarantee consistency (miss a little bit of compression) */ -@@ -218,7 +230,7 @@ ZSTD_DUBT_findBetterDictMatch ( - } - - if (bestLength >= MINMATCH) { -- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; -+ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; - DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", - curr, (U32)bestLength, (U32)*offsetPtr, mIndex); - } -@@ -227,10 +239,11 @@ ZSTD_DUBT_findBetterDictMatch ( - } - - --static size_t --ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, - const BYTE* const ip, const BYTE* const iend, -- size_t* offsetPtr, -+ size_t* offBasePtr, - U32 const mls, - const ZSTD_dictMode_e dictMode) - { -@@ -327,8 +340,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, - if (matchLength > bestLength) { - if (matchLength > matchEndIdx - matchIndex) - matchEndIdx = matchIndex + (U32)matchLength; -- if ( 
(4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) -- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); -+ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) -+ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); - if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ - if (dictMode == ZSTD_dictMatchState) { - nbCompares = 0; /* in addition to avoiding checking any -@@ -361,16 +374,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, - if (dictMode == ZSTD_dictMatchState && nbCompares) { - bestLength = ZSTD_DUBT_findBetterDictMatch( - ms, ip, iend, -- offsetPtr, bestLength, nbCompares, -+ offBasePtr, bestLength, nbCompares, - mls, dictMode); - } - - assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ - ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ - if (bestLength >= MINMATCH) { -- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; -+ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; - DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", -- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); -+ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); - } - return bestLength; - } -@@ -378,17 +391,18 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, - - - /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */ --FORCE_INLINE_TEMPLATE size_t --ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, -+FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, - const BYTE* const ip, const BYTE* const iLimit, -- size_t* offsetPtr, -+ size_t* offBasePtr, - const U32 mls /* template */, - const ZSTD_dictMode_e dictMode) - { - DEBUGLOG(7, "ZSTD_BtFindBestMatch"); - if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ - ZSTD_updateDUBT(ms, ip, iLimit, mls); -- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); -+ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); - } - - /* ********************************* -@@ -561,7 +575,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb - /* save best solution */ - if (currentMl > ml) { - ml = currentMl; -- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); -+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); - if (ip+currentMl == iLimit) { - /* best possible, avoids read overflow on next attempt */ - return ml; -@@ -598,7 +612,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb - /* save best solution */ - if (currentMl > ml) { - ml = currentMl; -- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); -+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); - if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ - } - } -@@ -614,10 +628,12 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb - - /* Update chains up to ip (excluded) - Assumption : always within prefix (i.e. 
not within extDict) */ --FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( -+FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+U32 ZSTD_insertAndFindFirstIndex_internal( - ZSTD_matchState_t* ms, - const ZSTD_compressionParameters* const cParams, -- const BYTE* ip, U32 const mls) -+ const BYTE* ip, U32 const mls, U32 const lazySkipping) - { - U32* const hashTable = ms->hashTable; - const U32 hashLog = cParams->hashLog; -@@ -632,6 +648,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( - NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; - hashTable[h] = idx; - idx++; -+ /* Stop inserting every position when in the lazy skipping mode. */ -+ if (lazySkipping) -+ break; - } - - ms->nextToUpdate = target; -@@ -640,11 +659,12 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( - - U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { - const ZSTD_compressionParameters* const cParams = &ms->cParams; -- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); -+ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); - } - - /* inlining is important to hardwire a hot branch (template emulation) */ - FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR - size_t ZSTD_HcFindBestMatch( - ZSTD_matchState_t* ms, - const BYTE* const ip, const BYTE* const iLimit, -@@ -684,14 +704,15 @@ size_t ZSTD_HcFindBestMatch( - } - - /* HC4 match finder */ -- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); -+ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); - - for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { - size_t currentMl=0; - if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { - const BYTE* const match = base + matchIndex; - assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ -- if (match[ml] == ip[ml]) /* potentially better */ -+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ -+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ - currentMl = ZSTD_count(ip, match, iLimit); - } else { - const BYTE* const match = dictBase + matchIndex; -@@ -703,7 +724,7 @@ size_t ZSTD_HcFindBestMatch( - /* save best solution */ - if (currentMl > ml) { - ml = currentMl; -- *offsetPtr = STORE_OFFSET(curr - matchIndex); -+ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); - if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ - } - -@@ -739,7 +760,7 @@ size_t ZSTD_HcFindBestMatch( - if (currentMl > ml) { - ml = currentMl; - assert(curr > matchIndex + dmsIndexDelta); -- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); -+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); - if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ - } - -@@ -756,8 +777,6 @@ size_t ZSTD_HcFindBestMatch( - * (SIMD) Row-based matchfinder - ***********************************/ - /* Constants for row-based hash */ --#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ --#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ - #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) - #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ - -@@ -769,64 +788,19 @@ typedef U64 
ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr - * Starting from the LSB, returns the idx of the next non-zero bit. - * Basically counting the nb of trailing zeroes. - */ --static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { -- assert(val != 0); --# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) -- if (sizeof(size_t) == 4) { -- U32 mostSignificantWord = (U32)(val >> 32); -- U32 leastSignificantWord = (U32)val; -- if (leastSignificantWord == 0) { -- return 32 + (U32)__builtin_ctz(mostSignificantWord); -- } else { -- return (U32)__builtin_ctz(leastSignificantWord); -- } -- } else { -- return (U32)__builtin_ctzll(val); -- } --# else -- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count -- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer -- */ -- val = ~val & (val - 1ULL); /* Lowest set bit mask */ -- val = val - ((val >> 1) & 0x5555555555555555); -- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); -- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); --# endif --} -- --/* ZSTD_rotateRight_*(): -- * Rotates a bitfield to the right by "count" bits. -- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts -- */ --FORCE_INLINE_TEMPLATE --U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { -- assert(count < 64); -- count &= 0x3F; /* for fickle pattern recognition */ -- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); --} -- --FORCE_INLINE_TEMPLATE --U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { -- assert(count < 32); -- count &= 0x1F; /* for fickle pattern recognition */ -- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); --} -- --FORCE_INLINE_TEMPLATE --U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { -- assert(count < 16); -- count &= 0x0F; /* for fickle pattern recognition */ -- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); -+MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { -+ return ZSTD_countTrailingZeros64(val); - } - - /* ZSTD_row_nextIndex(): - * Returns the next index to insert at within a tagTable row, and updates the "head" -- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) -+ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) - */ - FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { -- U32 const next = (*tagRow - 1) & rowMask; -- *tagRow = (BYTE)next; -- return next; -+ U32 next = (*tagRow-1) & rowMask; -+ next += (next == 0) ? rowMask : 0; /* skip first position */ -+ *tagRow = (BYTE)next; -+ return next; - } - - /* ZSTD_isAligned(): -@@ -840,7 +814,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { - /* ZSTD_row_prefetch(): - * Performs prefetching for the hashTable and tagTable at a given row. 
- */ --FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { -+FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { - PREFETCH_L1(hashTable + relRow); - if (rowLog >= 5) { - PREFETCH_L1(hashTable + relRow + 16); -@@ -859,18 +833,20 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* ta - * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, - * but not beyond iLimit. - */ --FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, -+FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, - U32 const rowLog, U32 const mls, - U32 idx, const BYTE* const iLimit) - { - U32 const* const hashTable = ms->hashTable; -- U16 const* const tagTable = ms->tagTable; -+ BYTE const* const tagTable = ms->tagTable; - U32 const hashLog = ms->rowHashLog; - U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); - U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); - - for (; idx < lim; ++idx) { -- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); -+ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); - U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; - ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); - ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; -@@ -885,12 +861,15 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B - * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at - * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. - */ --FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, -- U16 const* tagTable, BYTE const* base, -+FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, -+ BYTE const* tagTable, BYTE const* base, - U32 idx, U32 const hashLog, -- U32 const rowLog, U32 const mls) -+ U32 const rowLog, U32 const mls, -+ U64 const hashSalt) - { -- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); -+ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); - U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; - ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); - { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; -@@ -902,28 +881,29 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab - /* ZSTD_row_update_internalImpl(): - * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. 
- */ --FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, -- U32 updateStartIdx, U32 const updateEndIdx, -- U32 const mls, U32 const rowLog, -- U32 const rowMask, U32 const useCache) -+FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, -+ U32 updateStartIdx, U32 const updateEndIdx, -+ U32 const mls, U32 const rowLog, -+ U32 const rowMask, U32 const useCache) - { - U32* const hashTable = ms->hashTable; -- U16* const tagTable = ms->tagTable; -+ BYTE* const tagTable = ms->tagTable; - U32 const hashLog = ms->rowHashLog; - const BYTE* const base = ms->window.base; - - DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); - for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { -- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) -- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); -+ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) -+ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); - U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; - U32* const row = hashTable + relRow; -- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. -- Explicit cast allows us to get exact desired position within each row */ -+ BYTE* tagRow = tagTable + relRow; - U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); - -- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); -- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; -+ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); -+ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; - row[pos] = updateStartIdx; - } - } -@@ -932,9 +912,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, - * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. - * Skips sections of long matches as is necessary. - */ --FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, -- U32 const mls, U32 const rowLog, -- U32 const rowMask, U32 const useCache) -+FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, -+ U32 const mls, U32 const rowLog, -+ U32 const rowMask, U32 const useCache) - { - U32 idx = ms->nextToUpdate; - const BYTE* const base = ms->window.base; -@@ -971,7 +953,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { - const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); - - DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); -- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); -+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); -+} -+ -+/* Returns the mask width of bits group of which will be set to 1. Given not all -+ * architectures have easy movemask instruction, this helps to iterate over -+ * groups of bits easier and faster. 
-+ */ -+FORCE_INLINE_TEMPLATE U32 -+ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) -+{ -+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); -+ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); -+ (void)rowEntries; -+#if defined(ZSTD_ARCH_ARM_NEON) -+ /* NEON path only works for little endian */ -+ if (!MEM_isLittleEndian()) { -+ return 1; -+ } -+ if (rowEntries == 16) { -+ return 4; -+ } -+ if (rowEntries == 32) { -+ return 2; -+ } -+ if (rowEntries == 64) { -+ return 1; -+ } -+#endif -+ return 1; - } - - #if defined(ZSTD_ARCH_X86_SSE2) -@@ -994,71 +1004,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U - } - #endif - --/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches -- * the hash at the nth position in a row of the tagTable. -- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield -- * to match up with the actual layout of the entries within the hashTable */ -+#if defined(ZSTD_ARCH_ARM_NEON) -+FORCE_INLINE_TEMPLATE ZSTD_VecMask -+ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) -+{ -+ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); -+ if (rowEntries == 16) { -+ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. -+ * After that groups of 4 bits represent the equalMask. We lower -+ * all bits except the highest in these groups by doing AND with -+ * 0x88 = 0b10001000. -+ */ -+ const uint8x16_t chunk = vld1q_u8(src); -+ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); -+ const uint8x8_t res = vshrn_n_u16(equalMask, 4); -+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); -+ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; -+ } else if (rowEntries == 32) { -+ /* Same idea as with rowEntries == 16 but doing AND with -+ * 0x55 = 0b01010101. 
-+ */ -+ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); -+ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); -+ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); -+ const uint8x16_t dup = vdupq_n_u8(tag); -+ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); -+ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); -+ const uint8x8_t res = vsli_n_u8(t0, t1, 4); -+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; -+ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; -+ } else { /* rowEntries == 64 */ -+ const uint8x16x4_t chunk = vld4q_u8(src); -+ const uint8x16_t dup = vdupq_n_u8(tag); -+ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); -+ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); -+ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); -+ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); -+ -+ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); -+ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); -+ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); -+ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); -+ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); -+ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); -+ return ZSTD_rotateRight_U64(matches, headGrouped); -+ } -+} -+#endif -+ -+/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by -+ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" -+ * matches the hash at the nth position in a row of the tagTable. -+ * Each row is a circular buffer beginning at the value of "headGrouped". So we -+ * must rotate the "matches" bitfield to match up with the actual layout of the -+ * entries within the hashTable */ - FORCE_INLINE_TEMPLATE ZSTD_VecMask --ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) -+ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) - { -- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; -+ const BYTE* const src = tagRow; - assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); - assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); -+ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); - - #if defined(ZSTD_ARCH_X86_SSE2) - -- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); -+ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); - - #else /* SW or NEON-LE */ - - # if defined(ZSTD_ARCH_ARM_NEON) - /* This NEON path only works for little endian - otherwise use SWAR below */ - if (MEM_isLittleEndian()) { -- if (rowEntries == 16) { -- const uint8x16_t chunk = vld1q_u8(src); -- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); -- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); -- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); -- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); -- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); -- const U16 hi = (U16)vgetq_lane_u8(t3, 8); -- const U16 lo = (U16)vgetq_lane_u8(t3, 0); -- return ZSTD_rotateRight_U16((hi << 8) | lo, head); -- } else if (rowEntries == 32) { -- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); -- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); -- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); -- const uint8x16_t equalMask0 = 
vceqq_u8(chunk0, vdupq_n_u8(tag)); -- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); -- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); -- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); -- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); -- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); -- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); -- const uint8x8x2_t t3 = vuzp_u8(t2, t0); -- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); -- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); -- return ZSTD_rotateRight_U32(matches, head); -- } else { /* rowEntries == 64 */ -- const uint8x16x4_t chunk = vld4q_u8(src); -- const uint8x16_t dup = vdupq_n_u8(tag); -- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); -- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); -- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); -- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); -- -- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); -- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); -- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); -- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); -- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); -- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); -- return ZSTD_rotateRight_U64(matches, head); -- } -+ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); - } - # endif /* ZSTD_ARCH_ARM_NEON */ - /* SWAR */ -- { const size_t chunkSize = sizeof(size_t); -+ { const int chunkSize = sizeof(size_t); - const size_t shiftAmount = ((chunkSize * 8) - chunkSize); - const size_t xFF = ~((size_t)0); - const size_t x01 = xFF / 0xFF; -@@ -1091,11 +1112,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, - } - matches = ~matches; - if (rowEntries == 16) { -- return ZSTD_rotateRight_U16((U16)matches, head); -+ return ZSTD_rotateRight_U16((U16)matches, headGrouped); - } else if (rowEntries == 32) { -- return ZSTD_rotateRight_U32((U32)matches, head); -+ return ZSTD_rotateRight_U32((U32)matches, headGrouped); - } else { -- return ZSTD_rotateRight_U64((U64)matches, head); -+ return ZSTD_rotateRight_U64((U64)matches, headGrouped); - } - } - #endif -@@ -1103,20 +1124,21 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, - - /* The high-level approach of the SIMD row based match finder is as follows: - * - Figure out where to insert the new entry: -- * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" -- * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines -+ * - Generate a hash for current input posistion and split it into a one byte of tag and `rowHashLog` bits of index. -+ * - The hash is salted by a value that changes on every contex reset, so when the same table is used -+ * we will avoid collisions that would otherwise slow us down by intorducing phantom matches. -+ * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines - * which row to insert into. -- * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can -- * be considered as a circular buffer with a "head" index that resides in the tagTable. -- * - Also insert the "tag" into the equivalent row and position in the tagTable. -- * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. 
-- * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, -- * for alignment/performance reasons, leaving some bytes unused. -- * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and -+ * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can -+ * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes -+ * per row). -+ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and - * generate a bitfield that we can cycle through to check the collisions in the hash table. - * - Pick the longest match. -+ * - Insert the tag into the equivalent row and position in the tagTable. - */ - FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR - size_t ZSTD_RowFindBestMatch( - ZSTD_matchState_t* ms, - const BYTE* const ip, const BYTE* const iLimit, -@@ -1125,7 +1147,7 @@ size_t ZSTD_RowFindBestMatch( - const U32 rowLog) - { - U32* const hashTable = ms->hashTable; -- U16* const tagTable = ms->tagTable; -+ BYTE* const tagTable = ms->tagTable; - U32* const hashCache = ms->hashCache; - const U32 hashLog = ms->rowHashLog; - const ZSTD_compressionParameters* const cParams = &ms->cParams; -@@ -1143,8 +1165,11 @@ size_t ZSTD_RowFindBestMatch( - const U32 rowEntries = (1U << rowLog); - const U32 rowMask = rowEntries - 1; - const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ -+ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); -+ const U64 hashSalt = ms->hashSalt; - U32 nbAttempts = 1U << cappedSearchLog; - size_t ml=4-1; -+ U32 hash; - - /* DMS/DDS variables that may be referenced laster */ - const ZSTD_matchState_t* const dms = ms->dictMatchState; -@@ -1168,7 +1193,7 @@ size_t ZSTD_RowFindBestMatch( - if (dictMode == ZSTD_dictMatchState) { - /* Prefetch DMS rows */ - U32* const dmsHashTable = dms->hashTable; -- U16* const dmsTagTable = dms->tagTable; -+ BYTE* const dmsTagTable = dms->tagTable; - U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); - U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; - dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; -@@ -1178,23 +1203,34 @@ size_t ZSTD_RowFindBestMatch( - } - - /* Update the hashTable and tagTable up to (but not including) ip */ -- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); -+ if (!ms->lazySkipping) { -+ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); -+ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); -+ } else { -+ /* Stop inserting every position when in the lazy skipping mode. -+ * The hash cache is also not kept up to date in this mode. 
-+ */ -+ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); -+ ms->nextToUpdate = curr; -+ } -+ ms->hashSaltEntropy += hash; /* collect salt entropy */ -+ - { /* Get the hash for ip, compute the appropriate row */ -- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); - U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; - U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; - U32* const row = hashTable + relRow; - BYTE* tagRow = (BYTE*)(tagTable + relRow); -- U32 const head = *tagRow & rowMask; -+ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; - U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; - size_t numMatches = 0; - size_t currMatch = 0; -- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); -+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); - - /* Cycle through the matches and prefetch */ -- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { -- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; -+ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { -+ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; - U32 const matchIndex = row[matchPos]; -+ if(matchPos == 0) continue; - assert(numMatches < rowEntries); - if (matchIndex < lowLimit) - break; -@@ -1204,13 +1240,14 @@ size_t ZSTD_RowFindBestMatch( - PREFETCH_L1(dictBase + matchIndex); - } - matchBuffer[numMatches++] = matchIndex; -+ --nbAttempts; - } - - /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop - in ZSTD_row_update_internal() at the next search. */ - { - U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); -- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; -+ tagRow[pos] = (BYTE)tag; - row[pos] = ms->nextToUpdate++; - } - -@@ -1224,7 +1261,8 @@ size_t ZSTD_RowFindBestMatch( - if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { - const BYTE* const match = base + matchIndex; - assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ -- if (match[ml] == ip[ml]) /* potentially better */ -+ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ -+ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ - currentMl = ZSTD_count(ip, match, iLimit); - } else { - const BYTE* const match = dictBase + matchIndex; -@@ -1236,7 +1274,7 @@ size_t ZSTD_RowFindBestMatch( - /* Save best solution */ - if (currentMl > ml) { - ml = currentMl; -- *offsetPtr = STORE_OFFSET(curr - matchIndex); -+ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); - if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ - } - } -@@ -1254,19 +1292,21 @@ size_t ZSTD_RowFindBestMatch( - const U32 dmsSize = (U32)(dmsEnd - dmsBase); - const U32 dmsIndexDelta = dictLimit - dmsSize; - -- { U32 const head = *dmsTagRow & rowMask; -+ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; - U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; - size_t numMatches = 0; - size_t currMatch = 0; -- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); -+ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); - -- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { -- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; -+ 
for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { -+ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; - U32 const matchIndex = dmsRow[matchPos]; -+ if(matchPos == 0) continue; - if (matchIndex < dmsLowestIndex) - break; - PREFETCH_L1(dmsBase + matchIndex); - matchBuffer[numMatches++] = matchIndex; -+ --nbAttempts; - } - - /* Return the longest match */ -@@ -1285,7 +1325,7 @@ size_t ZSTD_RowFindBestMatch( - if (currentMl > ml) { - ml = currentMl; - assert(curr > matchIndex + dmsIndexDelta); -- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); -+ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); - if (ip+currentMl == iLimit) break; - } - } -@@ -1472,8 +1512,9 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( - * Common parser - lazy strategy - *********************************/ - --FORCE_INLINE_TEMPLATE size_t --ZSTD_compressBlock_lazy_generic( -+FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+size_t ZSTD_compressBlock_lazy_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, - U32 rep[ZSTD_REP_NUM], - const void* src, size_t srcSize, -@@ -1491,7 +1532,8 @@ ZSTD_compressBlock_lazy_generic( - const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); - const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); - -- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; -+ U32 offset_1 = rep[0], offset_2 = rep[1]; -+ U32 offsetSaved1 = 0, offsetSaved2 = 0; - - const int isDMS = dictMode == ZSTD_dictMatchState; - const int isDDS = dictMode == ZSTD_dedicatedDictSearch; -@@ -1512,8 +1554,8 @@ ZSTD_compressBlock_lazy_generic( - U32 const curr = (U32)(ip - base); - U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); - U32 const maxRep = curr - windowLow; -- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; -- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; -+ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; -+ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; - } - if (isDxS) { - /* dictMatchState repCode checks don't currently handle repCode == 0 -@@ -1522,10 +1564,11 @@ ZSTD_compressBlock_lazy_generic( - assert(offset_2 <= dictAndPrefixLength); - } - -+ /* Reset the lazy skipping state */ -+ ms->lazySkipping = 0; -+ - if (searchMethod == search_rowHash) { -- ZSTD_row_fillHashCache(ms, base, rowLog, -- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), -- ms->nextToUpdate, ilimit); -+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); - } - - /* Match Loop */ -@@ -1537,7 +1580,7 @@ ZSTD_compressBlock_lazy_generic( - #endif - while (ip < ilimit) { - size_t matchLength=0; -- size_t offcode=STORE_REPCODE_1; -+ size_t offBase = REPCODE1_TO_OFFBASE; - const BYTE* start=ip+1; - DEBUGLOG(7, "search baseline (depth 0)"); - -@@ -1562,14 +1605,23 @@ ZSTD_compressBlock_lazy_generic( - } - - /* first search (depth 0) */ -- { size_t offsetFound = 999999999; -- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); -+ { size_t offbaseFound = 999999999; -+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); - if (ml2 > matchLength) -- matchLength = ml2, start = ip, offcode=offsetFound; -+ matchLength = ml2, start = ip, offBase = offbaseFound; - } - - if (matchLength < 4) { -- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ -+ size_t const step = ((size_t)(ip-anchor) >> 
kSearchStrength) + 1; /* jump faster over incompressible sections */; -+ ip += step; -+ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. -+ * In this mode we stop inserting every position into our tables, and only insert -+ * positions that we search, which is one in step positions. -+ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, -+ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets -+ * triggered once we've gone 2KB without finding any matches. -+ */ -+ ms->lazySkipping = step > kLazySkippingStep; - continue; - } - -@@ -1579,12 +1631,12 @@ ZSTD_compressBlock_lazy_generic( - DEBUGLOG(7, "search depth 1"); - ip ++; - if ( (dictMode == ZSTD_noDict) -- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { -+ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { - size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; - int const gain2 = (int)(mlRep * 3); -- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); -+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); - if ((mlRep >= 4) && (gain2 > gain1)) -- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; -+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; - } - if (isDxS) { - const U32 repIndex = (U32)(ip - base) - offset_1; -@@ -1596,17 +1648,17 @@ ZSTD_compressBlock_lazy_generic( - const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; - size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; - int const gain2 = (int)(mlRep * 3); -- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); -+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); - if ((mlRep >= 4) && (gain2 > gain1)) -- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; -+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; - } - } -- { size_t offset2=999999999; -- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); -- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ -- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); -+ { size_t ofbCandidate=999999999; -+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); -+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ -+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); - if ((ml2 >= 4) && (gain2 > gain1)) { -- matchLength = ml2, offcode = offset2, start = ip; -+ matchLength = ml2, offBase = ofbCandidate, start = ip; - continue; /* search a better one */ - } } - -@@ -1615,12 +1667,12 @@ ZSTD_compressBlock_lazy_generic( - DEBUGLOG(7, "search depth 2"); - ip ++; - if ( (dictMode == ZSTD_noDict) -- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { -+ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { - size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; - int const gain2 = (int)(mlRep * 4); -- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); -+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); - if ((mlRep >= 4) && (gain2 > gain1)) -- matchLength = mlRep, offcode = 
STORE_REPCODE_1, start = ip; -+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; - } - if (isDxS) { - const U32 repIndex = (U32)(ip - base) - offset_1; -@@ -1632,17 +1684,17 @@ ZSTD_compressBlock_lazy_generic( - const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; - size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; - int const gain2 = (int)(mlRep * 4); -- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); -+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); - if ((mlRep >= 4) && (gain2 > gain1)) -- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; -+ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; - } - } -- { size_t offset2=999999999; -- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); -- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ -- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); -+ { size_t ofbCandidate=999999999; -+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); -+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ -+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); - if ((ml2 >= 4) && (gain2 > gain1)) { -- matchLength = ml2, offcode = offset2, start = ip; -+ matchLength = ml2, offBase = ofbCandidate, start = ip; - continue; - } } } - break; /* nothing found : store previous solution */ -@@ -1653,26 +1705,33 @@ ZSTD_compressBlock_lazy_generic( - * notably if `value` is unsigned, resulting in a large positive `-value`. - */ - /* catch up */ -- if (STORED_IS_OFFSET(offcode)) { -+ if (OFFBASE_IS_OFFSET(offBase)) { - if (dictMode == ZSTD_noDict) { -- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) -- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ -+ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) -+ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ - { start--; matchLength++; } - } - if (isDxS) { -- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); -+ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); - const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; - const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; - while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ - } -- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); -+ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); - } - /* store sequence */ - _storeSequence: - { size_t const litLength = (size_t)(start - anchor); -- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); -+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); - anchor = ip = start + matchLength; - } -+ if (ms->lazySkipping) { -+ /* We've found a match, disable lazy skipping mode, and refill the hash cache. 
*/ -+ if (searchMethod == search_rowHash) { -+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); -+ } -+ ms->lazySkipping = 0; -+ } - - /* check immediate repcode */ - if (isDxS) { -@@ -1686,8 +1745,8 @@ ZSTD_compressBlock_lazy_generic( - && (MEM_read32(repMatch) == MEM_read32(ip)) ) { - const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; - matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; -- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ -- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); -+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ -+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); - ip += matchLength; - anchor = ip; - continue; -@@ -1701,166 +1760,181 @@ ZSTD_compressBlock_lazy_generic( - && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { - /* store sequence */ - matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; -- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ -- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); -+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ -+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); - ip += matchLength; - anchor = ip; - continue; /* faster when present ... (?) */ - } } } - -- /* Save reps for next block */ -- rep[0] = offset_1 ? offset_1 : savedOffset; -- rep[1] = offset_2 ? offset_2 : savedOffset; -+ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), -+ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ -+ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; -+ -+ /* save reps for next block */ -+ rep[0] = offset_1 ? offset_1 : offsetSaved1; -+ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; - - /* Return the last literals size */ - return (size_t)(iend - anchor); - } -+#endif /* build exclusions */ - - --size_t ZSTD_compressBlock_btlazy2( -+#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR -+size_t ZSTD_compressBlock_greedy( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); - } - --size_t ZSTD_compressBlock_lazy2( -+size_t ZSTD_compressBlock_greedy_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); - } - --size_t ZSTD_compressBlock_lazy( -+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); - } - --size_t ZSTD_compressBlock_greedy( -+size_t ZSTD_compressBlock_greedy_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); - } - --size_t ZSTD_compressBlock_btlazy2_dictMatchState( -+size_t ZSTD_compressBlock_greedy_dictMatchState_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); - } - --size_t ZSTD_compressBlock_lazy2_dictMatchState( -+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); - } -+#endif - --size_t ZSTD_compressBlock_lazy_dictMatchState( -+#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR -+size_t ZSTD_compressBlock_lazy( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); - } - --size_t ZSTD_compressBlock_greedy_dictMatchState( -+size_t ZSTD_compressBlock_lazy_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, 
srcSize, search_hashChain, 0, ZSTD_dictMatchState); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); - } - -- --size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( -+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); - } - --size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( -+size_t ZSTD_compressBlock_lazy_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); - } - --size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( -+size_t ZSTD_compressBlock_lazy_dictMatchState_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); - } - --/* Row-based matchfinder */ --size_t ZSTD_compressBlock_lazy2_row( -+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); - } -+#endif - --size_t ZSTD_compressBlock_lazy_row( -+#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR -+size_t ZSTD_compressBlock_lazy2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); - } - --size_t ZSTD_compressBlock_greedy_row( -+size_t ZSTD_compressBlock_lazy2_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); - } - --size_t ZSTD_compressBlock_lazy2_dictMatchState_row( -+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); - } - --size_t ZSTD_compressBlock_lazy_dictMatchState_row( -+size_t ZSTD_compressBlock_lazy2_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* 
src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); - } - --size_t ZSTD_compressBlock_greedy_dictMatchState_row( -+size_t ZSTD_compressBlock_lazy2_dictMatchState_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); - } - -- - size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { - return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); - } -+#endif - --size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( -+#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR -+size_t ZSTD_compressBlock_btlazy2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); - } - --size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( -+size_t ZSTD_compressBlock_btlazy2_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - { -- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); -+ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); - } -+#endif - -+#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) - FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR - size_t ZSTD_compressBlock_lazy_extDict_generic( - ZSTD_matchState_t* ms, seqStore_t* seqStore, - U32 rep[ZSTD_REP_NUM], -@@ -1886,12 +1960,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( - - DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); - -+ /* Reset the lazy skipping state */ -+ ms->lazySkipping = 0; -+ - /* init */ - ip += (ip == prefixStart); - if (searchMethod == search_rowHash) { -- ZSTD_row_fillHashCache(ms, base, rowLog, -- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), -- ms->nextToUpdate, ilimit); -+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); - } - - /* Match Loop */ -@@ -1903,7 +1978,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( - #endif - while (ip < ilimit) { - size_t matchLength=0; -- size_t offcode=STORE_REPCODE_1; -+ size_t offBase = REPCODE1_TO_OFFBASE; - const BYTE* start=ip+1; - U32 curr = (U32)(ip-base); - -@@ -1922,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( - } } - - /* first search (depth 0) */ -- { size_t offsetFound = 999999999; -- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); -+ { size_t ofbCandidate = 999999999; -+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, 
mls, rowLog, searchMethod, ZSTD_extDict); - if (ml2 > matchLength) -- matchLength = ml2, start = ip, offcode=offsetFound; -+ matchLength = ml2, start = ip, offBase = ofbCandidate; - } - - if (matchLength < 4) { -- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ -+ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); -+ ip += step + 1; /* jump faster over incompressible sections */ -+ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. -+ * In this mode we stop inserting every position into our tables, and only insert -+ * positions that we search, which is one in step positions. -+ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, -+ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets -+ * triggered once we've gone 2KB without finding any matches. -+ */ -+ ms->lazySkipping = step > kLazySkippingStep; - continue; - } - -@@ -1939,7 +2023,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( - ip ++; - curr++; - /* check repCode */ -- if (offcode) { -+ if (offBase) { - const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); - const U32 repIndex = (U32)(curr - offset_1); - const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; -@@ -1951,18 +2035,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( - const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; - size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; - int const gain2 = (int)(repLength * 3); -- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); -+ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); - if ((repLength >= 4) && (gain2 > gain1)) -- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; -+ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; - } } - - /* search match, depth 1 */ -- { size_t offset2=999999999; -- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); -- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ -- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); -+ { size_t ofbCandidate = 999999999; -+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); -+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ -+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); - if ((ml2 >= 4) && (gain2 > gain1)) { -- matchLength = ml2, offcode = offset2, start = ip; -+ matchLength = ml2, offBase = ofbCandidate, start = ip; - continue; /* search a better one */ - } } - -@@ -1971,7 +2055,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( - ip ++; - curr++; - /* check repCode */ -- if (offcode) { -+ if (offBase) { - const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); - const U32 repIndex = (U32)(curr - offset_1); - const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; -@@ -1983,38 +2067,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( - const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; - size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; - int const gain2 = (int)(repLength * 4); -- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); -+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); - if ((repLength >= 4) && (gain2 > gain1)) -- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; -+ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; - } } - - /* search match, depth 2 */ -- { size_t offset2=999999999; -- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); -- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ -- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); -+ { size_t ofbCandidate = 999999999; -+ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); -+ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ -+ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); - if ((ml2 >= 4) && (gain2 > gain1)) { -- matchLength = ml2, offcode = offset2, start = ip; -+ matchLength = ml2, offBase = ofbCandidate, start = ip; - continue; - } } } - break; /* nothing found : store previous solution */ - } - - /* catch up */ -- if (STORED_IS_OFFSET(offcode)) { -- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); -+ if (OFFBASE_IS_OFFSET(offBase)) { -+ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); - const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; - const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; - while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ -- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); -+ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); - } - - /* store sequence */ - _storeSequence: - { size_t const litLength = (size_t)(start - anchor); -- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); -+ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); - anchor = ip = start + matchLength; - } -+ if (ms->lazySkipping) { -+ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ -+ if (searchMethod == search_rowHash) { -+ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); -+ } -+ ms->lazySkipping = 0; -+ } - - /* check immediate repcode */ - while (ip <= ilimit) { -@@ -2029,8 +2120,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( - /* repcode detected we should take it */ - const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; - matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; -- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ -- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); -+ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ -+ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); - ip += matchLength; - anchor = ip; - continue; /* faster when present ... (?) 
*/ -@@ -2045,8 +2136,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( - /* Return the last literals size */ - return (size_t)(iend - anchor); - } -+#endif /* build exclusions */ - -- -+#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR - size_t ZSTD_compressBlock_greedy_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) -@@ -2054,49 +2146,55 @@ size_t ZSTD_compressBlock_greedy_extDict( - return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); - } - --size_t ZSTD_compressBlock_lazy_extDict( -+size_t ZSTD_compressBlock_greedy_extDict_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) -- - { -- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); -+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); - } -+#endif - --size_t ZSTD_compressBlock_lazy2_extDict( -+#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR -+size_t ZSTD_compressBlock_lazy_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - - { -- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); -+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); - } - --size_t ZSTD_compressBlock_btlazy2_extDict( -+size_t ZSTD_compressBlock_lazy_extDict_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - - { -- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); -+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); - } -+#endif - --size_t ZSTD_compressBlock_greedy_extDict_row( -+#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR -+size_t ZSTD_compressBlock_lazy2_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) -+ - { -- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); -+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); - } - --size_t ZSTD_compressBlock_lazy_extDict_row( -+size_t ZSTD_compressBlock_lazy2_extDict_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) -- - { -- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); -+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); - } -+#endif - --size_t ZSTD_compressBlock_lazy2_extDict_row( -+#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR -+size_t ZSTD_compressBlock_btlazy2_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize) - - { -- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); -+ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); - } -+#endif -diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h -index e5bdf4df8dde..22c9201f4e63 100644 ---- a/lib/zstd/compress/zstd_lazy.h -+++ b/lib/zstd/compress/zstd_lazy.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. 
-+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -22,98 +23,175 @@ - */ - #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 - -+#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ -+ -+#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) - U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); - void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); - - void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); - - void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ -+#endif - --size_t ZSTD_compressBlock_btlazy2( -+#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR -+size_t ZSTD_compressBlock_greedy( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy2( -+size_t ZSTD_compressBlock_greedy_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy( -+size_t ZSTD_compressBlock_greedy_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_greedy( -+size_t ZSTD_compressBlock_greedy_dictMatchState_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy2_row( -+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy_row( -+size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_greedy_row( -+size_t ZSTD_compressBlock_greedy_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); -- --size_t ZSTD_compressBlock_btlazy2_dictMatchState( -+size_t ZSTD_compressBlock_greedy_extDict_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy2_dictMatchState( -+ -+#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy -+#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row -+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState -+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row -+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch -+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row -+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict -+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row -+#else -+#define ZSTD_COMPRESSBLOCK_GREEDY NULL -+#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL -+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL -+#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL -+#define 
ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL -+#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL -+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL -+#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL -+#endif -+ -+#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR -+size_t ZSTD_compressBlock_lazy( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy_dictMatchState( -+size_t ZSTD_compressBlock_lazy_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_greedy_dictMatchState( -+size_t ZSTD_compressBlock_lazy_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy2_dictMatchState_row( -+size_t ZSTD_compressBlock_lazy_dictMatchState_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy_dictMatchState_row( -+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_greedy_dictMatchState_row( -+size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); -- --size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( -+size_t ZSTD_compressBlock_lazy_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( -+size_t ZSTD_compressBlock_lazy_extDict_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( -+ -+#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy -+#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row -+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState -+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row -+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch -+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row -+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict -+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row -+#else -+#define ZSTD_COMPRESSBLOCK_LAZY NULL -+#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL -+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL -+#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL -+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL -+#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL -+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL -+#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL -+#endif -+ -+#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR -+size_t ZSTD_compressBlock_lazy2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( -+size_t ZSTD_compressBlock_lazy2_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( -+size_t ZSTD_compressBlock_lazy2_dictMatchState( - ZSTD_matchState_t* 
ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( -+size_t ZSTD_compressBlock_lazy2_dictMatchState_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); -- --size_t ZSTD_compressBlock_greedy_extDict( -+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy_extDict( -+size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); - size_t ZSTD_compressBlock_lazy2_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_greedy_extDict_row( -+size_t ZSTD_compressBlock_lazy2_extDict_row( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy_extDict_row( -+ -+#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2 -+#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row -+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState -+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row -+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch -+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row -+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict -+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row -+#else -+#define ZSTD_COMPRESSBLOCK_LAZY2 NULL -+#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL -+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL -+#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL -+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL -+#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL -+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL -+#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL -+#endif -+ -+#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR -+size_t ZSTD_compressBlock_btlazy2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_lazy2_extDict_row( -+size_t ZSTD_compressBlock_btlazy2_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); - size_t ZSTD_compressBlock_btlazy2_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); -- -+ -+#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2 -+#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState -+#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict -+#else -+#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL -+#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL -+#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL -+#endif -+ - - - #endif /* ZSTD_LAZY_H */ -diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c -index dd86fc83e7dd..07f3bc6437ce 100644 ---- a/lib/zstd/compress/zstd_ldm.c -+++ b/lib/zstd/compress/zstd_ldm.c -@@ -1,5 +1,6 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Yann Collet, Facebook, Inc. 
-+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -242,11 +243,15 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, - switch(ms->cParams.strategy) - { - case ZSTD_fast: -- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); -+ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); - break; - - case ZSTD_dfast: -- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); -+#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR -+ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); -+#else -+ assert(0); /* shouldn't be called: cparams should've been adjusted. */ -+#endif - break; - - case ZSTD_greedy: -@@ -318,7 +323,9 @@ static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) - } - } - --static size_t ZSTD_ldm_generateSequences_internal( -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+size_t ZSTD_ldm_generateSequences_internal( - ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, - ldmParams_t const* params, void const* src, size_t srcSize) - { -@@ -549,7 +556,7 @@ size_t ZSTD_ldm_generateSequences( - * the window through early invalidation. - * TODO: * Test the chunk size. - * * Try invalidation after the sequence generation and test the -- * the offset against maxDist directly. -+ * offset against maxDist directly. - * - * NOTE: Because of dictionaries + sequence splitting we MUST make sure - * that any offset used is valid at the END of the sequence, since it may -@@ -689,7 +696,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, - /* maybeSplitSequence updates rawSeqStore->pos */ - rawSeq const sequence = maybeSplitSequence(rawSeqStore, - (U32)(iend - ip), minMatch); -- int i; - /* End signal */ - if (sequence.offset == 0) - break; -@@ -702,6 +708,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, - /* Run the block compressor */ - DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); - { -+ int i; - size_t const newLitLength = - blockCompressor(ms, seqStore, rep, ip, sequence.litLength); - ip += sequence.litLength; -@@ -711,7 +718,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, - rep[0] = sequence.offset; - /* Store the sequence */ - ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, -- STORE_OFFSET(sequence.offset), -+ OFFSET_TO_OFFBASE(sequence.offset), - sequence.matchLength); - ip += sequence.matchLength; - } -diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h -index fbc6a5e88fd7..c540731abde7 100644 ---- a/lib/zstd/compress/zstd_ldm.h -+++ b/lib/zstd/compress/zstd_ldm.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h -index 647f865be290..cfccfc46f6f7 100644 ---- a/lib/zstd/compress/zstd_ldm_geartab.h -+++ b/lib/zstd/compress/zstd_ldm_geartab.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c -index fd82acfda62f..a87b66ac8d24 100644 ---- a/lib/zstd/compress/zstd_opt.c -+++ b/lib/zstd/compress/zstd_opt.c -@@ -1,5 +1,6 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -12,11 +13,14 @@ - #include "hist.h" - #include "zstd_opt.h" - -+#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) - - #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ - #define ZSTD_MAX_PRICE (1<<30) - --#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ -+#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ - - - /*-************************************* -@@ -26,27 +30,35 @@ - #if 0 /* approximation at bit level (for tests) */ - # define BITCOST_ACCURACY 0 - # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) --# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) -+# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) - #elif 0 /* fractional bit accuracy (for tests) */ - # define BITCOST_ACCURACY 8 - # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) --# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) -+# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) - #else /* opt==approx, ultra==accurate */ - # define BITCOST_ACCURACY 8 - # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) --# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) -+# define WEIGHT(stat,opt) ((opt) ? 
ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) - #endif - -+/* ZSTD_bitWeight() : -+ * provide estimated "cost" of a stat in full bits only */ - MEM_STATIC U32 ZSTD_bitWeight(U32 stat) - { - return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); - } - -+/* ZSTD_fracWeight() : -+ * provide fractional-bit "cost" of a stat, -+ * using linear interpolation approximation */ - MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) - { - U32 const stat = rawStat + 1; - U32 const hb = ZSTD_highbit32(stat); - U32 const BWeight = hb * BITCOST_MULTIPLIER; -+ /* Fweight was meant for "Fractional weight" -+ * but it's effectively a value between 1 and 2 -+ * using fixed point arithmetic */ - U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; - U32 const weight = BWeight + FWeight; - assert(hb + BITCOST_ACCURACY < 31); -@@ -57,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) - /* debugging function, - * @return price in bytes as fractional value - * for debug messages only */ --MEM_STATIC double ZSTD_fCost(U32 price) -+MEM_STATIC double ZSTD_fCost(int price) - { - return (double)price / (BITCOST_MULTIPLIER*8); - } -@@ -88,20 +100,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts) - return total; - } - --static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) -+typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e; -+ -+static U32 -+ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) - { - U32 s, sum=0; -- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); -+ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", -+ (unsigned)lastEltIndex+1, (unsigned)shift ); - assert(shift < 30); - for (s=0; s> shift); -- sum += table[s]; -+ unsigned const base = base1 ? 
1 : (table[s]>0); -+ unsigned const newStat = base + (table[s] >> shift); -+ sum += newStat; -+ table[s] = newStat; - } - return sum; - } - - /* ZSTD_scaleStats() : -- * reduce all elements in table is sum too large -+ * reduce all elt frequencies in table if sum too large - * return the resulting sum of elements */ - static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) - { -@@ -110,7 +128,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) - DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); - assert(logTarget < 30); - if (factor <= 1) return prevsum; -- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); -+ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); - } - - /* ZSTD_rescaleFreqs() : -@@ -129,18 +147,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, - DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); - optPtr->priceType = zop_dynamic; - -- if (optPtr->litLengthSum == 0) { /* first block : init */ -- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ -- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); -+ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ -+ -+ /* heuristic: use pre-defined stats for too small inputs */ -+ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { -+ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); - optPtr->priceType = zop_predef; - } - - assert(optPtr->symbolCosts != NULL); - if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { -- /* huffman table presumed generated by dictionary */ -+ -+ /* huffman stats covering the full value set : table presumed generated by dictionary */ - optPtr->priceType = zop_dynamic; - - if (compressedLiterals) { -+ /* generate literals statistics from huffman table */ - unsigned lit; - assert(optPtr->litFreq != NULL); - optPtr->litSum = 0; -@@ -188,13 +210,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, - optPtr->offCodeSum += optPtr->offCodeFreq[of]; - } } - -- } else { /* not a dictionary */ -+ } else { /* first block, no dictionary */ - - assert(optPtr->litFreq != NULL); - if (compressedLiterals) { -+ /* base initial cost of literals on direct frequency within src */ - unsigned lit = MaxLit; - HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ -- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); -+ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); - } - - { unsigned const baseLLfreqs[MaxLL+1] = { -@@ -224,10 +247,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, - optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); - } - -- - } - -- } else { /* new block : re-use previous statistics, scaled down */ -+ } else { /* new block : scale down accumulated statistics */ - - if (compressedLiterals) - optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); -@@ -246,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, - const optState_t* const optPtr, - int optLevel) - { -+ DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength); - if (litLength == 0) return 0; - - if (!ZSTD_compressedLiterals(optPtr)) -@@ -255,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, - return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ - - /* dynamic statistics 
*/ -- { U32 price = litLength * optPtr->litSumBasePrice; -+ { U32 price = optPtr->litSumBasePrice * litLength; -+ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; - U32 u; -+ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); - for (u=0; u < litLength; u++) { -- assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ -- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); -+ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); -+ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; -+ price -= litPrice; - } - return price; - } -@@ -272,10 +298,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP - assert(litLength <= ZSTD_BLOCKSIZE_MAX); - if (optPtr->priceType == zop_predef) - return WEIGHT(litLength, optLevel); -- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX -- * because it isn't representable in the zstd format. So instead just -- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block -- * would be all literals. -+ -+ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX -+ * because it isn't representable in the zstd format. -+ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. -+ * In such a case, the block would be all literals. - */ - if (litLength == ZSTD_BLOCKSIZE_MAX) - return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); -@@ -289,24 +316,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP - } - - /* ZSTD_getMatchPrice() : -- * Provides the cost of the match part (offset + matchLength) of a sequence -+ * Provides the cost of the match part (offset + matchLength) of a sequence. - * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. 
-- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 -+ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() - * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) - */ - FORCE_INLINE_TEMPLATE U32 --ZSTD_getMatchPrice(U32 const offcode, -+ZSTD_getMatchPrice(U32 const offBase, - U32 const matchLength, - const optState_t* const optPtr, - int const optLevel) - { - U32 price; -- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); -+ U32 const offCode = ZSTD_highbit32(offBase); - U32 const mlBase = matchLength - MINMATCH; - assert(matchLength >= MINMATCH); - -- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ -- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); -+ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ -+ return WEIGHT(mlBase, optLevel) -+ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ - - /* dynamic statistics */ - price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); -@@ -325,10 +353,10 @@ ZSTD_getMatchPrice(U32 const offcode, - } - - /* ZSTD_updateStats() : -- * assumption : literals + litLengtn <= iend */ -+ * assumption : literals + litLength <= iend */ - static void ZSTD_updateStats(optState_t* const optPtr, - U32 litLength, const BYTE* literals, -- U32 offsetCode, U32 matchLength) -+ U32 offBase, U32 matchLength) - { - /* literals */ - if (ZSTD_compressedLiterals(optPtr)) { -@@ -344,8 +372,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, - optPtr->litLengthSum++; - } - -- /* offset code : expected to follow storeSeq() numeric representation */ -- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); -+ /* offset code : follows storeSeq() numeric representation */ -+ { U32 const offCode = ZSTD_highbit32(offBase); - assert(offCode <= MaxOff); - optPtr->offCodeFreq[offCode]++; - optPtr->offCodeSum++; -@@ -379,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) - - /* Update hashTable3 up to ip (excluded) - Assumption : always within prefix (i.e. not within extDict) */ --static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, -- U32* nextToUpdate3, -- const BYTE* const ip) -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, -+ U32* nextToUpdate3, -+ const BYTE* const ip) - { - U32* const hashTable3 = ms->hashTable3; - U32 const hashLog3 = ms->hashLog3; -@@ -408,7 +438,9 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, - * @param ip assumed <= iend-8 . 
- * @param target The target of ZSTD_updateTree_internal() - we are filling to this position - * @return : nb of positions added */ --static U32 ZSTD_insertBt1( -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+U32 ZSTD_insertBt1( - const ZSTD_matchState_t* ms, - const BYTE* const ip, const BYTE* const iend, - U32 const target, -@@ -527,6 +559,7 @@ static U32 ZSTD_insertBt1( - } - - FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR - void ZSTD_updateTree_internal( - ZSTD_matchState_t* ms, - const BYTE* const ip, const BYTE* const iend, -@@ -535,7 +568,7 @@ void ZSTD_updateTree_internal( - const BYTE* const base = ms->window.base; - U32 const target = (U32)(ip - base); - U32 idx = ms->nextToUpdate; -- DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", -+ DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", - idx, target, dictMode); - - while(idx < target) { -@@ -553,15 +586,18 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { - } - - FORCE_INLINE_TEMPLATE --U32 ZSTD_insertBtAndGetAllMatches ( -- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ -- ZSTD_matchState_t* ms, -- U32* nextToUpdate3, -- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, -- const U32 rep[ZSTD_REP_NUM], -- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ -- const U32 lengthToBeat, -- U32 const mls /* template */) -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+U32 -+ZSTD_insertBtAndGetAllMatches ( -+ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ -+ ZSTD_matchState_t* ms, -+ U32* nextToUpdate3, -+ const BYTE* const ip, const BYTE* const iLimit, -+ const ZSTD_dictMode_e dictMode, -+ const U32 rep[ZSTD_REP_NUM], -+ const U32 ll0, /* tells if associated literal length is 0 or not. 
This value must be 0 or 1 */ -+ const U32 lengthToBeat, -+ const U32 mls /* template */) - { - const ZSTD_compressionParameters* const cParams = &ms->cParams; - U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); -@@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( - DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", - repCode, ll0, repOffset, repLen); - bestLength = repLen; -- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ -+ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ - matches[mnum].len = (U32)repLen; - mnum++; - if ( (repLen > sufficient_len) -@@ -673,7 +709,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( - bestLength = mlen; - assert(curr > matchIndex3); - assert(mnum==0); /* no prior solution */ -- matches[0].off = STORE_OFFSET(curr - matchIndex3); -+ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); - matches[0].len = (U32)mlen; - mnum = 1; - if ( (mlen > sufficient_len) | -@@ -706,13 +742,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( - } - - if (matchLength > bestLength) { -- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", -- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); -+ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", -+ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); - assert(matchEndIdx > matchIndex); - if (matchLength > matchEndIdx - matchIndex) - matchEndIdx = matchIndex + (U32)matchLength; - bestLength = matchLength; -- matches[mnum].off = STORE_OFFSET(curr - matchIndex); -+ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); - matches[mnum].len = (U32)matchLength; - mnum++; - if ( (matchLength > ZSTD_OPT_NUM) -@@ -754,12 +790,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( - - if (matchLength > bestLength) { - matchIndex = dictMatchIndex + dmsIndexDelta; -- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", -- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); -+ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", -+ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); - if (matchLength > matchEndIdx - matchIndex) - matchEndIdx = matchIndex + (U32)matchLength; - bestLength = matchLength; -- matches[mnum].off = STORE_OFFSET(curr - matchIndex); -+ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); - matches[mnum].len = (U32)matchLength; - mnum++; - if ( (matchLength > ZSTD_OPT_NUM) -@@ -792,7 +828,9 @@ typedef U32 (*ZSTD_getAllMatchesFn)( - U32 const ll0, - U32 const lengthToBeat); - --FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( -+FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+U32 ZSTD_btGetAllMatches_internal( - ZSTD_match_t* matches, - ZSTD_matchState_t* ms, - U32* nextToUpdate3, -@@ -960,7 +998,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, - const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) - { - U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; -- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ -+ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ - U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; - - /* Ensure that current block position is not outside of the match */ -@@ -971,11 +1009,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* 
nbMatches, - } - - if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { -- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); -- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", -- candidateOffCode, candidateMatchLength, currPosInBlock); -+ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); -+ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", -+ candidateOffBase, candidateMatchLength, currPosInBlock); - matches[*nbMatches].len = candidateMatchLength; -- matches[*nbMatches].off = candidateOffCode; -+ matches[*nbMatches].off = candidateOffBase; - (*nbMatches)++; - } - } -@@ -1011,11 +1049,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, - * Optimal parser - *********************************/ - --static U32 ZSTD_totalLen(ZSTD_optimal_t sol) --{ -- return sol.litlen + sol.mlen; --} -- - #if 0 /* debug */ - - static void -@@ -1033,7 +1066,13 @@ listStats(const U32* table, int lastEltID) - - #endif - --FORCE_INLINE_TEMPLATE size_t -+#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) -+#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) -+#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1)) -+ -+FORCE_INLINE_TEMPLATE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+size_t - ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, - seqStore_t* seqStore, - U32 rep[ZSTD_REP_NUM], -@@ -1059,9 +1098,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, - - ZSTD_optimal_t* const opt = optStatePtr->priceTable; - ZSTD_match_t* const matches = optStatePtr->matchTable; -- ZSTD_optimal_t lastSequence; -+ ZSTD_optimal_t lastStretch; - ZSTD_optLdm_t optLdm; - -+ ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t)); -+ - optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; - optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; - ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); -@@ -1082,103 +1123,139 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, - U32 const ll0 = !litlen; - U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); - ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, -- (U32)(ip-istart), (U32)(iend - ip)); -- if (!nbMatches) { ip++; continue; } -+ (U32)(ip-istart), (U32)(iend-ip)); -+ if (!nbMatches) { -+ DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); -+ ip++; -+ continue; -+ } -+ -+ /* Match found: let's store this solution, and eventually find more candidates. -+ * During this forward pass, @opt is used to store stretches, -+ * defined as "a match followed by N literals". -+ * Note how this is different from a Sequence, which is "N literals followed by a match". -+ * Storing stretches allows us to store different match predecessors -+ * for each literal position part of a literals run. 
*/ - - /* initialize opt[0] */ -- { U32 i ; for (i=0; i immediate encoding */ - { U32 const maxML = matches[nbMatches-1].len; -- U32 const maxOffcode = matches[nbMatches-1].off; -- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", -- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); -+ U32 const maxOffBase = matches[nbMatches-1].off; -+ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", -+ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); - - if (maxML > sufficient_len) { -- lastSequence.litlen = litlen; -- lastSequence.mlen = maxML; -- lastSequence.off = maxOffcode; -- DEBUGLOG(6, "large match (%u>%u), immediate encoding", -+ lastStretch.litlen = 0; -+ lastStretch.mlen = maxML; -+ lastStretch.off = maxOffBase; -+ DEBUGLOG(6, "large match (%u>%u) => immediate encoding", - maxML, sufficient_len); - cur = 0; -- last_pos = ZSTD_totalLen(lastSequence); -+ last_pos = maxML; - goto _shortestPath; - } } - - /* set prices for first matches starting position == 0 */ - assert(opt[0].price >= 0); -- { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); -- U32 pos; -+ { U32 pos; - U32 matchNb; - for (pos = 1; pos < minMatch; pos++) { -- opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ -+ opt[pos].price = ZSTD_MAX_PRICE; -+ opt[pos].mlen = 0; -+ opt[pos].litlen = litlen + pos; - } - for (matchNb = 0; matchNb < nbMatches; matchNb++) { -- U32 const offcode = matches[matchNb].off; -+ U32 const offBase = matches[matchNb].off; - U32 const end = matches[matchNb].len; - for ( ; pos <= end ; pos++ ) { -- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); -- U32 const sequencePrice = literalsPrice + matchPrice; -+ int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); -+ int const sequencePrice = opt[0].price + matchPrice; - DEBUGLOG(7, "rPos:%u => set initial price : %.2f", - pos, ZSTD_fCost(sequencePrice)); - opt[pos].mlen = pos; -- opt[pos].off = offcode; -- opt[pos].litlen = litlen; -- opt[pos].price = (int)sequencePrice; -- } } -+ opt[pos].off = offBase; -+ opt[pos].litlen = 0; /* end of match */ -+ opt[pos].price = sequencePrice + LL_PRICE(0); -+ } -+ } - last_pos = pos-1; -+ opt[pos].price = ZSTD_MAX_PRICE; - } - } - - /* check further positions */ - for (cur = 1; cur <= last_pos; cur++) { - const BYTE* const inr = ip + cur; -- assert(cur < ZSTD_OPT_NUM); -- DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) -+ assert(cur <= ZSTD_OPT_NUM); -+ DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur); - - /* Fix current position with one literal if cheaper */ -- { U32 const litlen = (opt[cur-1].mlen == 0) ? 
opt[cur-1].litlen + 1 : 1; -+ { U32 const litlen = opt[cur-1].litlen + 1; - int const price = opt[cur-1].price -- + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) -- + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) -- - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); -+ + LIT_PRICE(ip+cur-1) -+ + LL_INCPRICE(litlen); - assert(price < 1000000000); /* overflow check */ - if (price <= opt[cur].price) { -+ ZSTD_optimal_t const prevMatch = opt[cur]; - DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", - inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, - opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); -- opt[cur].mlen = 0; -- opt[cur].off = 0; -+ opt[cur] = opt[cur-1]; - opt[cur].litlen = litlen; - opt[cur].price = price; -+ if ( (optLevel >= 1) /* additional check only for higher modes */ -+ && (prevMatch.litlen == 0) /* replace a match */ -+ && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */ -+ && LIKELY(ip + cur < iend) -+ ) { -+ /* check next position, in case it would be cheaper */ -+ int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1); -+ int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1); -+ DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f", -+ cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals)); -+ if ( (with1literal < withMoreLiterals) -+ && (with1literal < opt[cur+1].price) ) { -+ /* update offset history - before it disappears */ -+ U32 const prev = cur - prevMatch.mlen; -+ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); -+ assert(cur >= prevMatch.mlen); -+ DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", -+ ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), -+ newReps.rep[0], newReps.rep[1], newReps.rep[2] ); -+ opt[cur+1] = prevMatch; /* mlen & offbase */ -+ ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(repcodes_t)); -+ opt[cur+1].litlen = 1; -+ opt[cur+1].price = with1literal; -+ if (last_pos < cur+1) last_pos = cur+1; -+ } -+ } - } else { -- DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", -- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), -- opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); -+ DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f)", -+ inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); - } - } - -- /* Set the repcodes of the current position. We must do it here -- * because we rely on the repcodes of the 2nd to last sequence being -- * correct to set the next chunks repcodes during the backward -- * traversal. -+ /* Offset history is not updated during match comparison. -+ * Do it here, now that the match is selected and confirmed. 
- */ - ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); - assert(cur >= opt[cur].mlen); -- if (opt[cur].mlen != 0) { -+ if (opt[cur].litlen == 0) { -+ /* just finished a match => alter offset history */ - U32 const prev = cur - opt[cur].mlen; -- repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); -+ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); - ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); -- } else { -- ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); - } - - /* last match must start at a minimum distance of 8 from oend */ -@@ -1188,15 +1265,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, - - if ( (optLevel==0) /*static_test*/ - && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { -- DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); -+ DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1); - continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ - } - - assert(opt[cur].price >= 0); -- { U32 const ll0 = (opt[cur].mlen != 0); -- U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; -- U32 const previousPrice = (U32)opt[cur].price; -- U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); -+ { U32 const ll0 = (opt[cur].litlen == 0); -+ int const previousPrice = opt[cur].price; -+ int const basePrice = previousPrice + LL_PRICE(0); - U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); - U32 matchNb; - -@@ -1208,18 +1284,17 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, - continue; - } - -- { U32 const maxML = matches[nbMatches-1].len; -- DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", -- inr-istart, cur, nbMatches, maxML); -- -- if ( (maxML > sufficient_len) -- || (cur + maxML >= ZSTD_OPT_NUM) ) { -- lastSequence.mlen = maxML; -- lastSequence.off = matches[nbMatches-1].off; -- lastSequence.litlen = litlen; -- cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ -- last_pos = cur + ZSTD_totalLen(lastSequence); -- if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ -+ { U32 const longestML = matches[nbMatches-1].len; -+ DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of longest ML=%u", -+ inr-istart, cur, nbMatches, longestML); -+ -+ if ( (longestML > sufficient_len) -+ || (cur + longestML >= ZSTD_OPT_NUM) -+ || (ip + cur + longestML >= iend) ) { -+ lastStretch.mlen = longestML; -+ lastStretch.off = matches[nbMatches-1].off; -+ lastStretch.litlen = 0; -+ last_pos = cur + longestML; - goto _shortestPath; - } } - -@@ -1230,20 +1305,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, - U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; - U32 mlen; - -- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", -- matchNb, matches[matchNb].off, lastML, litlen); -+ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", -+ matchNb, matches[matchNb].off, lastML, opt[cur].litlen); - - for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ - U32 const pos = cur + mlen; -- int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); -+ int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); - - if ((pos > last_pos) || (price < opt[pos].price)) { - DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", - pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); -- while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ -+ while (last_pos < pos) { -+ /* fill empty positions, for future comparisons */ -+ last_pos++; -+ opt[last_pos].price = ZSTD_MAX_PRICE; -+ opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */ -+ } - opt[pos].mlen = mlen; - opt[pos].off = offset; -- opt[pos].litlen = litlen; -+ opt[pos].litlen = 0; - opt[pos].price = price; - } else { - DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", -@@ -1251,52 +1331,86 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, - if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ - } - } } } -+ opt[last_pos+1].price = ZSTD_MAX_PRICE; - } /* for (cur = 1; cur <= last_pos; cur++) */ - -- lastSequence = opt[last_pos]; -- cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ -- assert(cur < ZSTD_OPT_NUM); /* control overflow*/ -+ lastStretch = opt[last_pos]; -+ assert(cur >= lastStretch.mlen); -+ cur = last_pos - lastStretch.mlen; - - _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ - assert(opt[0].mlen == 0); -+ assert(last_pos >= lastStretch.mlen); -+ assert(cur == last_pos - lastStretch.mlen); - -- /* Set the next chunk's repcodes based on the repcodes of the beginning -- * of the last match, and the last sequence. This avoids us having to -- * update them while traversing the sequences. -- */ -- if (lastSequence.mlen != 0) { -- repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); -- ZSTD_memcpy(rep, &reps, sizeof(reps)); -+ if (lastStretch.mlen==0) { -+ /* no solution : all matches have been converted into literals */ -+ assert(lastStretch.litlen == (ip - anchor) + last_pos); -+ ip += last_pos; -+ continue; -+ } -+ assert(lastStretch.off > 0); -+ -+ /* Update offset history */ -+ if (lastStretch.litlen == 0) { -+ /* finishing on a match : update offset history */ -+ repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); -+ ZSTD_memcpy(rep, &reps, sizeof(repcodes_t)); - } else { -- ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); -+ ZSTD_memcpy(rep, lastStretch.rep, sizeof(repcodes_t)); -+ assert(cur >= lastStretch.litlen); -+ cur -= lastStretch.litlen; - } - -- { U32 const storeEnd = cur + 1; -+ /* Let's write the shortest path solution. -+ * It is stored in @opt in reverse order, -+ * starting from @storeEnd (==cur+2), -+ * effectively partially @opt overwriting. 
-+ * Content is changed too: -+ * - So far, @opt stored stretches, aka a match followed by literals -+ * - Now, it will store sequences, aka literals followed by a match -+ */ -+ { U32 const storeEnd = cur + 2; - U32 storeStart = storeEnd; -- U32 seqPos = cur; -+ U32 stretchPos = cur; - - DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", - last_pos, cur); (void)last_pos; -- assert(storeEnd < ZSTD_OPT_NUM); -- DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", -- storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); -- opt[storeEnd] = lastSequence; -- while (seqPos > 0) { -- U32 const backDist = ZSTD_totalLen(opt[seqPos]); -+ assert(storeEnd < ZSTD_OPT_SIZE); -+ DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", -+ storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off); -+ if (lastStretch.litlen > 0) { -+ /* last "sequence" is unfinished: just a bunch of literals */ -+ opt[storeEnd].litlen = lastStretch.litlen; -+ opt[storeEnd].mlen = 0; -+ storeStart = storeEnd-1; -+ opt[storeStart] = lastStretch; -+ } { -+ opt[storeEnd] = lastStretch; /* note: litlen will be fixed */ -+ storeStart = storeEnd; -+ } -+ while (1) { -+ ZSTD_optimal_t nextStretch = opt[stretchPos]; -+ opt[storeStart].litlen = nextStretch.litlen; -+ DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)", -+ opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off); -+ if (nextStretch.mlen == 0) { -+ /* reaching beginning of segment */ -+ break; -+ } - storeStart--; -- DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", -- seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); -- opt[storeStart] = opt[seqPos]; -- seqPos = (seqPos > backDist) ? seqPos - backDist : 0; -+ opt[storeStart] = nextStretch; /* note: litlen will be fixed */ -+ assert(nextStretch.litlen + nextStretch.mlen <= stretchPos); -+ stretchPos -= nextStretch.litlen + nextStretch.mlen; - } - - /* save sequences */ -- DEBUGLOG(6, "sending selected sequences into seqStore") -+ DEBUGLOG(6, "sending selected sequences into seqStore"); - { U32 storePos; - for (storePos=storeStart; storePos <= storeEnd; storePos++) { - U32 const llen = opt[storePos].litlen; - U32 const mlen = opt[storePos].mlen; -- U32 const offCode = opt[storePos].off; -+ U32 const offBase = opt[storePos].off; - U32 const advance = llen + mlen; - DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", - anchor - istart, (unsigned)llen, (unsigned)mlen); -@@ -1308,11 +1422,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, - } - - assert(anchor + llen <= iend); -- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); -- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); -+ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); -+ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); - anchor += advance; - ip = anchor; - } } -+ DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]); -+ -+ /* update all costs */ - ZSTD_setBasePrices(optStatePtr, optLevel); - } - } /* while (ip < ilimit) */ -@@ -1320,21 +1437,27 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, - /* Return the last literals size */ - return (size_t)(iend - anchor); - } -+#endif /* build exclusions */ - -+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR - static size_t ZSTD_compressBlock_opt0( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) - { - return 
ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); - } -+#endif - -+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR - static size_t ZSTD_compressBlock_opt2( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) - { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); - } -+#endif - -+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR - size_t ZSTD_compressBlock_btopt( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - const void* src, size_t srcSize) -@@ -1342,20 +1465,23 @@ size_t ZSTD_compressBlock_btopt( - DEBUGLOG(5, "ZSTD_compressBlock_btopt"); - return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); - } -+#endif - - - - -+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR - /* ZSTD_initStats_ultra(): - * make a first compression pass, just to seed stats with more accurate starting values. - * only works on first block, with no dictionary and no ldm. -- * this function cannot error, hence its contract must be respected. -+ * this function cannot error out, its narrow contract must be respected. - */ --static void --ZSTD_initStats_ultra(ZSTD_matchState_t* ms, -- seqStore_t* seqStore, -- U32 rep[ZSTD_REP_NUM], -- const void* src, size_t srcSize) -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+void ZSTD_initStats_ultra(ZSTD_matchState_t* ms, -+ seqStore_t* seqStore, -+ U32 rep[ZSTD_REP_NUM], -+ const void* src, size_t srcSize) - { - U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ - ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); -@@ -1368,7 +1494,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, - - ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ - -- /* invalidate first scan from history */ -+ /* invalidate first scan from history, only keep entropy stats */ - ZSTD_resetSeqStore(seqStore); - ms->window.base -= srcSize; - ms->window.dictLimit += (U32)srcSize; -@@ -1392,10 +1518,10 @@ size_t ZSTD_compressBlock_btultra2( - U32 const curr = (U32)((const BYTE*)src - ms->window.base); - DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); - -- /* 2-pass strategy: -+ /* 2-passes strategy: - * this strategy makes a first pass over first block to collect statistics -- * and seed next round's statistics with it. -- * After 1st pass, function forgets everything, and starts a new block. -+ * in order to seed next round's statistics with it. -+ * After 1st pass, function forgets history, and starts a new block. - * Consequently, this can only work if no data has been previously loaded in tables, - * aka, no dictionary, no prefix, no ldm preprocessing. 
- * The compression ratio gain is generally small (~0.5% on first block), -@@ -1404,15 +1530,17 @@ size_t ZSTD_compressBlock_btultra2( - if ( (ms->opt.litLengthSum==0) /* first block */ - && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ - && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ -- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ -- && (srcSize > ZSTD_PREDEF_THRESHOLD) -+ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ -+ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ - ) { - ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); - } - - return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); - } -+#endif - -+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR - size_t ZSTD_compressBlock_btopt_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - const void* src, size_t srcSize) -@@ -1420,18 +1548,20 @@ size_t ZSTD_compressBlock_btopt_dictMatchState( - return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); - } - --size_t ZSTD_compressBlock_btultra_dictMatchState( -+size_t ZSTD_compressBlock_btopt_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - const void* src, size_t srcSize) - { -- return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); -+ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); - } -+#endif - --size_t ZSTD_compressBlock_btopt_extDict( -+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR -+size_t ZSTD_compressBlock_btultra_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - const void* src, size_t srcSize) - { -- return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); -+ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); - } - - size_t ZSTD_compressBlock_btultra_extDict( -@@ -1440,6 +1570,7 @@ size_t ZSTD_compressBlock_btultra_extDict( - { - return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); - } -+#endif - - /* note : no btultra2 variant for extDict nor dictMatchState, - * because btultra2 is not meant to work with dictionaries -diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h -index 22b862858ba7..ac1b743d27cd 100644 ---- a/lib/zstd/compress/zstd_opt.h -+++ b/lib/zstd/compress/zstd_opt.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -14,30 +15,40 @@ - - #include "zstd_compress_internal.h" - -+#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ -+ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) - /* used in ZSTD_loadDictionaryContent() */ - void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); -+#endif - -+#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR - size_t ZSTD_compressBlock_btopt( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_btultra( -+size_t ZSTD_compressBlock_btopt_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); --size_t ZSTD_compressBlock_btultra2( -+size_t ZSTD_compressBlock_btopt_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); - -+#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt -+#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState -+#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict -+#else -+#define ZSTD_COMPRESSBLOCK_BTOPT NULL -+#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL -+#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL -+#endif - --size_t ZSTD_compressBlock_btopt_dictMatchState( -+#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR -+size_t ZSTD_compressBlock_btultra( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); - size_t ZSTD_compressBlock_btultra_dictMatchState( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); -- --size_t ZSTD_compressBlock_btopt_extDict( -- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], -- void const* src, size_t srcSize); - size_t ZSTD_compressBlock_btultra_extDict( - ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize); -@@ -45,6 +56,20 @@ size_t ZSTD_compressBlock_btultra_extDict( - /* note : no btultra2 variant for extDict nor dictMatchState, - * because btultra2 is not meant to work with dictionaries - * and is only specific for the first block (no prefix) */ -+size_t ZSTD_compressBlock_btultra2( -+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], -+ void const* src, size_t srcSize); -+ -+#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra -+#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState -+#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict -+#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2 -+#else -+#define ZSTD_COMPRESSBLOCK_BTULTRA NULL -+#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL -+#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL -+#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL -+#endif - - - #endif /* ZSTD_OPT_H */ -diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c -index 60958afebc41..ac8b87f48f84 100644 ---- a/lib/zstd/decompress/huf_decompress.c -+++ b/lib/zstd/decompress/huf_decompress.c -@@ -1,7 +1,8 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* ****************************************************************** - * huff0 huffman decoder, - * part of Finite State Entropy library -- * Copyright (c) Yann Collet, Facebook, Inc. 
-+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * - * You can contact the author at : - * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy -@@ -19,10 +20,10 @@ - #include "../common/compiler.h" - #include "../common/bitstream.h" /* BIT_* */ - #include "../common/fse.h" /* to compress headers */ --#define HUF_STATIC_LINKING_ONLY - #include "../common/huf.h" - #include "../common/error_private.h" - #include "../common/zstd_internal.h" -+#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ - - /* ************************************************************** - * Constants -@@ -34,6 +35,12 @@ - * Macros - ****************************************************************/ - -+#ifdef HUF_DISABLE_FAST_DECODE -+# define HUF_ENABLE_FAST_DECODE 0 -+#else -+# define HUF_ENABLE_FAST_DECODE 1 -+#endif -+ - /* These two optional macros force the use one way or another of the two - * Huffman decompression implementations. You can't force in both directions - * at the same time. -@@ -43,27 +50,25 @@ - #error "Cannot force the use of the X1 and X2 decoders at the same time!" - #endif - --#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 --# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE -+/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is -+ * supported at runtime, so we can add the BMI2 target attribute. -+ * When it is disabled, we will still get BMI2 if it is enabled statically. -+ */ -+#if DYNAMIC_BMI2 -+# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE - #else --# define HUF_ASM_X86_64_BMI2_ATTRS -+# define HUF_FAST_BMI2_ATTRS - #endif - - #define HUF_EXTERN_C - #define HUF_ASM_DECL HUF_EXTERN_C - --#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) -+#if DYNAMIC_BMI2 - # define HUF_NEED_BMI2_FUNCTION 1 - #else - # define HUF_NEED_BMI2_FUNCTION 0 - #endif - --#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) --# define HUF_NEED_DEFAULT_FUNCTION 1 --#else --# define HUF_NEED_DEFAULT_FUNCTION 0 --#endif -- - /* ************************************************************** - * Error Management - ****************************************************************/ -@@ -80,6 +85,11 @@ - /* ************************************************************** - * BMI2 Variant Wrappers - ****************************************************************/ -+typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, -+ const void *cSrc, -+ size_t cSrcSize, -+ const HUF_DTable *DTable); -+ - #if DYNAMIC_BMI2 - - #define HUF_DGEN(fn) \ -@@ -101,9 +111,9 @@ - } \ - \ - static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ -- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ -+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ - { \ -- if (bmi2) { \ -+ if (flags & HUF_flags_bmi2) { \ - return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ - } \ - return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ -@@ -113,9 +123,9 @@ - - #define HUF_DGEN(fn) \ - static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ -- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ -+ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ - { \ -- (void)bmi2; \ -+ (void)flags; \ - return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ - } - -@@ -134,43 +144,66 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) - return dtd; - } - --#if ZSTD_ENABLE_ASM_X86_64_BMI2 -- --static size_t HUF_initDStream(BYTE const* ip) { -+static size_t HUF_initFastDStream(BYTE 
const* ip) { - BYTE const lastByte = ip[7]; -- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; -+ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; - size_t const value = MEM_readLEST(ip) | 1; - assert(bitsConsumed <= 8); -+ assert(sizeof(size_t) == 8); - return value << bitsConsumed; - } -+ -+ -+/* -+ * The input/output arguments to the Huffman fast decoding loop: -+ * -+ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. -+ * op [in/out] - The output pointers, must be updated to reflect what is written. -+ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. -+ * dt [in] - The decoding table. -+ * ilowest [in] - The beginning of the valid range of the input. Decoders may read -+ * down to this pointer. It may be below iend[0]. -+ * oend [in] - The end of the output stream. op[3] must not cross oend. -+ * iend [in] - The end of each input stream. ip[i] may cross iend[i], -+ * as long as it is above ilowest, but that indicates corruption. -+ */ - typedef struct { - BYTE const* ip[4]; - BYTE* op[4]; - U64 bits[4]; - void const* dt; -- BYTE const* ilimit; -+ BYTE const* ilowest; - BYTE* oend; - BYTE const* iend[4]; --} HUF_DecompressAsmArgs; -+} HUF_DecompressFastArgs; -+ -+typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); - - /* -- * Initializes args for the asm decoding loop. -- * @returns 0 on success -- * 1 if the fallback implementation should be used. -+ * Initializes args for the fast decoding loop. -+ * @returns 1 on success -+ * 0 if the fallback implementation should be used. - * Or an error code on failure. - */ --static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) -+static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) - { - void const* dt = DTable + 1; - U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; - -- const BYTE* const ilimit = (const BYTE*)src + 6 + 8; -+ const BYTE* const istart = (const BYTE*)src; - -- BYTE* const oend = (BYTE*)dst + dstSize; -+ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); - -- /* The following condition is false on x32 platform, -- * but HUF_asm is not compatible with this ABI */ -- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; -+ /* The fast decoding loop assumes 64-bit little-endian. -+ * This condition is false on x32. -+ */ -+ if (!MEM_isLittleEndian() || MEM_32bits()) -+ return 0; -+ -+ /* Avoid nullptr addition */ -+ if (dstSize == 0) -+ return 0; -+ assert(dst != NULL); - - /* strict minimum : jump table + 1 byte per stream */ - if (srcSize < 10) -@@ -181,11 +214,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, - * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. - */ - if (dtLog != HUF_DECODER_FAST_TABLELOG) -- return 1; -+ return 0; - - /* Read the jump table. 
*/ - { -- const BYTE* const istart = (const BYTE*)src; - size_t const length1 = MEM_readLE16(istart); - size_t const length2 = MEM_readLE16(istart+2); - size_t const length3 = MEM_readLE16(istart+4); -@@ -195,13 +227,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, - args->iend[2] = args->iend[1] + length2; - args->iend[3] = args->iend[2] + length3; - -- /* HUF_initDStream() requires this, and this small of an input -+ /* HUF_initFastDStream() requires this, and this small of an input - * won't benefit from the ASM loop anyways. -- * length1 must be >= 16 so that ip[0] >= ilimit before the loop -- * starts. - */ -- if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) -- return 1; -+ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) -+ return 0; - if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ - } - /* ip[] contains the position that is currently loaded into bits[]. */ -@@ -218,7 +248,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, - - /* No point to call the ASM loop for tiny outputs. */ - if (args->op[3] >= oend) -- return 1; -+ return 0; - - /* bits[] is the bit container. - * It is read from the MSB down to the LSB. -@@ -227,24 +257,25 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, - * set, so that CountTrailingZeros(bits[]) can be used - * to count how many bits we've consumed. - */ -- args->bits[0] = HUF_initDStream(args->ip[0]); -- args->bits[1] = HUF_initDStream(args->ip[1]); -- args->bits[2] = HUF_initDStream(args->ip[2]); -- args->bits[3] = HUF_initDStream(args->ip[3]); -- -- /* If ip[] >= ilimit, it is guaranteed to be safe to -- * reload bits[]. It may be beyond its section, but is -- * guaranteed to be valid (>= istart). -- */ -- args->ilimit = ilimit; -+ args->bits[0] = HUF_initFastDStream(args->ip[0]); -+ args->bits[1] = HUF_initFastDStream(args->ip[1]); -+ args->bits[2] = HUF_initFastDStream(args->ip[2]); -+ args->bits[3] = HUF_initFastDStream(args->ip[3]); -+ -+ /* The decoders must be sure to never read beyond ilowest. -+ * This is lower than iend[0], but allowing decoders to read -+ * down to ilowest can allow an extra iteration or two in the -+ * fast loop. -+ */ -+ args->ilowest = istart; - - args->oend = oend; - args->dt = dt; - -- return 0; -+ return 1; - } - --static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) -+static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) - { - /* Validate that we haven't overwritten. */ - if (args->op[stream] > segmentEnd) -@@ -258,15 +289,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs - return ERROR(corruption_detected); - - /* Construct the BIT_DStream_t. */ -- bit->bitContainer = MEM_readLE64(args->ip[stream]); -- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); -- bit->start = (const char*)args->iend[0]; -+ assert(sizeof(size_t) == 8); -+ bit->bitContainer = MEM_readLEST(args->ip[stream]); -+ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); -+ bit->start = (const char*)args->ilowest; - bit->limitPtr = bit->start + sizeof(size_t); - bit->ptr = (const char*)args->ip[stream]; - - return 0; - } --#endif -+ -+/* Calls X(N) for each stream 0, 1, 2, 3. 
*/ -+#define HUF_4X_FOR_EACH_STREAM(X) \ -+ do { \ -+ X(0); \ -+ X(1); \ -+ X(2); \ -+ X(3); \ -+ } while (0) -+ -+/* Calls X(N, var) for each stream 0, 1, 2, 3. */ -+#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ -+ do { \ -+ X(0, (var)); \ -+ X(1, (var)); \ -+ X(2, (var)); \ -+ X(3, (var)); \ -+ } while (0) - - - #ifndef HUF_FORCE_DECOMPRESS_X2 -@@ -283,10 +332,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi - static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { - U64 D4; - if (MEM_isLittleEndian()) { -- D4 = (symbol << 8) + nbBits; -+ D4 = (U64)((symbol << 8) + nbBits); - } else { -- D4 = symbol + (nbBits << 8); -+ D4 = (U64)(symbol + (nbBits << 8)); - } -+ assert(D4 < (1U << 16)); - D4 *= 0x0001000100010001ULL; - return D4; - } -@@ -329,13 +379,7 @@ typedef struct { - BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; - } HUF_ReadDTableX1_Workspace; - -- --size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) --{ -- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); --} -- --size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) -+size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) - { - U32 tableLog = 0; - U32 nbSymbols = 0; -@@ -350,7 +394,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr - DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); - /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ - -- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); -+ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); - if (HUF_isError(iSize)) return iSize; - - -@@ -377,9 +421,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr - * rankStart[0] is not filled because there are no entries in the table for - * weight 0. - */ -- { -- int n; -- int nextRankStart = 0; -+ { int n; -+ U32 nextRankStart = 0; - int const unroll = 4; - int const nLimit = (int)nbSymbols - unroll + 1; - for (n=0; n<(int)tableLog+1; n++) { -@@ -406,10 +449,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr - * We can switch based on the length to a different inner loop which is - * optimized for that particular case. 
- */ -- { -- U32 w; -- int symbol=wksp->rankVal[0]; -- int rankStart=0; -+ { U32 w; -+ int symbol = wksp->rankVal[0]; -+ int rankStart = 0; - for (w=1; wrankVal[w]; - int const length = (1 << w) >> 1; -@@ -483,15 +525,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog - } - - #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ -- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) -+ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) - --#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ -- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ -- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) -+#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ -+ do { \ -+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ -+ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ -+ } while (0) - --#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ -- if (MEM_64bits()) \ -- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) -+#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ -+ do { \ -+ if (MEM_64bits()) \ -+ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ -+ } while (0) - - HINT_INLINE size_t - HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) -@@ -519,7 +565,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons - while (p < pEnd) - HUF_DECODE_SYMBOLX1_0(p, bitDPtr); - -- return pEnd-pStart; -+ return (size_t)(pEnd-pStart); - } - - FORCE_INLINE_TEMPLATE size_t -@@ -529,7 +575,7 @@ HUF_decompress1X1_usingDTable_internal_body( - const HUF_DTable* DTable) - { - BYTE* op = (BYTE*)dst; -- BYTE* const oend = op + dstSize; -+ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); - const void* dtPtr = DTable + 1; - const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; - BIT_DStream_t bitD; -@@ -545,6 +591,10 @@ HUF_decompress1X1_usingDTable_internal_body( - return dstSize; - } - -+/* HUF_decompress4X1_usingDTable_internal_body(): -+ * Conditions : -+ * @dstSize >= 6 -+ */ - FORCE_INLINE_TEMPLATE size_t - HUF_decompress4X1_usingDTable_internal_body( - void* dst, size_t dstSize, -@@ -553,6 +603,7 @@ HUF_decompress4X1_usingDTable_internal_body( - { - /* Check */ - if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ -+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ - - { const BYTE* const istart = (const BYTE*) cSrc; - BYTE* const ostart = (BYTE*) dst; -@@ -588,6 +639,7 @@ HUF_decompress4X1_usingDTable_internal_body( - - if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ - if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ -+ assert(dstSize >= 6); /* validated above */ - CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); - CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); - CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); -@@ -650,52 +702,173 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo - } - #endif - --#if HUF_NEED_DEFAULT_FUNCTION - static - size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, - size_t cSrcSize, HUF_DTable const* DTable) { - return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); - } --#endif - - #if ZSTD_ENABLE_ASM_X86_64_BMI2 - --HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; -+HUF_ASM_DECL void 
HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; -+ -+#endif -+ -+static HUF_FAST_BMI2_ATTRS -+void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) -+{ -+ U64 bits[4]; -+ BYTE const* ip[4]; -+ BYTE* op[4]; -+ U16 const* const dtable = (U16 const*)args->dt; -+ BYTE* const oend = args->oend; -+ BYTE const* const ilowest = args->ilowest; -+ -+ /* Copy the arguments to local variables */ -+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); -+ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); -+ ZSTD_memcpy(&op, &args->op, sizeof(op)); -+ -+ assert(MEM_isLittleEndian()); -+ assert(!MEM_32bits()); -+ -+ for (;;) { -+ BYTE* olimit; -+ int stream; -+ -+ /* Assert loop preconditions */ -+#ifndef NDEBUG -+ for (stream = 0; stream < 4; ++stream) { -+ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); -+ assert(ip[stream] >= ilowest); -+ } -+#endif -+ /* Compute olimit */ -+ { -+ /* Each iteration produces 5 output symbols per stream */ -+ size_t const oiters = (size_t)(oend - op[3]) / 5; -+ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes -+ * per stream. -+ */ -+ size_t const iiters = (size_t)(ip[0] - ilowest) / 7; -+ /* We can safely run iters iterations before running bounds checks */ -+ size_t const iters = MIN(oiters, iiters); -+ size_t const symbols = iters * 5; -+ -+ /* We can simply check that op[3] < olimit, instead of checking all -+ * of our bounds, since we can't hit the other bounds until we've run -+ * iters iterations, which only happens when op[3] == olimit. -+ */ -+ olimit = op[3] + symbols; -+ -+ /* Exit fast decoding loop once we reach the end. */ -+ if (op[3] == olimit) -+ break; -+ -+ /* Exit the decoding loop if any input pointer has crossed the -+ * previous one. This indicates corruption, and a precondition -+ * to our loop is that ip[i] >= ip[0]. -+ */ -+ for (stream = 1; stream < 4; ++stream) { -+ if (ip[stream] < ip[stream - 1]) -+ goto _out; -+ } -+ } -+ -+#ifndef NDEBUG -+ for (stream = 1; stream < 4; ++stream) { -+ assert(ip[stream] >= ip[stream - 1]); -+ } -+#endif -+ -+#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ -+ do { \ -+ int const index = (int)(bits[(_stream)] >> 53); \ -+ int const entry = (int)dtable[index]; \ -+ bits[(_stream)] <<= (entry & 0x3F); \ -+ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ -+ } while (0) -+ -+#define HUF_4X1_RELOAD_STREAM(_stream) \ -+ do { \ -+ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ -+ int const nbBits = ctz & 7; \ -+ int const nbBytes = ctz >> 3; \ -+ op[(_stream)] += 5; \ -+ ip[(_stream)] -= nbBytes; \ -+ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ -+ bits[(_stream)] <<= nbBits; \ -+ } while (0) -+ -+ /* Manually unroll the loop because compilers don't consistently -+ * unroll the inner loops, which destroys performance. 
-+ */ -+ do { -+ /* Decode 5 symbols in each of the 4 streams */ -+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); -+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); -+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); -+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); -+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); -+ -+ /* Reload each of the 4 the bitstreams */ -+ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); -+ } while (op[3] < olimit); -+ -+#undef HUF_4X1_DECODE_SYMBOL -+#undef HUF_4X1_RELOAD_STREAM -+ } - --static HUF_ASM_X86_64_BMI2_ATTRS -+_out: -+ -+ /* Save the final values of each of the state variables back to args. */ -+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); -+ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); -+ ZSTD_memcpy(&args->op, &op, sizeof(op)); -+} -+ -+/* -+ * @returns @p dstSize on success (>= 6) -+ * 0 if the fallback implementation should be used -+ * An error if an error occurred -+ */ -+static HUF_FAST_BMI2_ATTRS - size_t --HUF_decompress4X1_usingDTable_internal_bmi2_asm( -+HUF_decompress4X1_usingDTable_internal_fast( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, -- const HUF_DTable* DTable) -+ const HUF_DTable* DTable, -+ HUF_DecompressFastLoopFn loopFn) - { - void const* dt = DTable + 1; -- const BYTE* const iend = (const BYTE*)cSrc + 6; -- BYTE* const oend = (BYTE*)dst + dstSize; -- HUF_DecompressAsmArgs args; -- { -- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); -- FORWARD_IF_ERROR(ret, "Failed to init asm args"); -- if (ret != 0) -- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); -+ BYTE const* const ilowest = (BYTE const*)cSrc; -+ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); -+ HUF_DecompressFastArgs args; -+ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); -+ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); -+ if (ret == 0) -+ return 0; - } - -- assert(args.ip[0] >= args.ilimit); -- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); -+ assert(args.ip[0] >= args.ilowest); -+ loopFn(&args); - -- /* Our loop guarantees that ip[] >= ilimit and that we haven't -+ /* Our loop guarantees that ip[] >= ilowest and that we haven't - * overwritten any op[]. - */ -- assert(args.ip[0] >= iend); -- assert(args.ip[1] >= iend); -- assert(args.ip[2] >= iend); -- assert(args.ip[3] >= iend); -+ assert(args.ip[0] >= ilowest); -+ assert(args.ip[0] >= ilowest); -+ assert(args.ip[1] >= ilowest); -+ assert(args.ip[2] >= ilowest); -+ assert(args.ip[3] >= ilowest); - assert(args.op[3] <= oend); -- (void)iend; -+ -+ assert(ilowest == args.ilowest); -+ assert(ilowest + 6 == args.iend[0]); -+ (void)ilowest; - - /* finish bit streams one by one. 
*/ -- { -- size_t const segmentSize = (dstSize+3) / 4; -+ { size_t const segmentSize = (dstSize+3) / 4; - BYTE* segmentEnd = (BYTE*)dst; - int i; - for (i = 0; i < 4; ++i) { -@@ -712,97 +885,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( - } - - /* decoded size */ -+ assert(dstSize != 0); - return dstSize; - } --#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ -- --typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, -- const void *cSrc, -- size_t cSrcSize, -- const HUF_DTable *DTable); - - HUF_DGEN(HUF_decompress1X1_usingDTable_internal) - - static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, -- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) -+ size_t cSrcSize, HUF_DTable const* DTable, int flags) - { -+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; -+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; -+ - #if DYNAMIC_BMI2 -- if (bmi2) { -+ if (flags & HUF_flags_bmi2) { -+ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; - # if ZSTD_ENABLE_ASM_X86_64_BMI2 -- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); --# else -- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); -+ if (!(flags & HUF_flags_disableAsm)) { -+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; -+ } - # endif -+ } else { -+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); - } --#else -- (void)bmi2; - #endif - - #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) -- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); --#else -- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); -+ if (!(flags & HUF_flags_disableAsm)) { -+ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; -+ } - #endif --} -- -- --size_t HUF_decompress1X1_usingDTable( -- void* dst, size_t dstSize, -- const void* cSrc, size_t cSrcSize, -- const HUF_DTable* DTable) --{ -- DTableDesc dtd = HUF_getDTableDesc(DTable); -- if (dtd.tableType != 0) return ERROR(GENERIC); -- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); --} - --size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, -- const void* cSrc, size_t cSrcSize, -- void* workSpace, size_t wkspSize) --{ -- const BYTE* ip = (const BYTE*) cSrc; -- -- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); -- if (HUF_isError(hSize)) return hSize; -- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); -- ip += hSize; cSrcSize -= hSize; -- -- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); --} -- -- --size_t HUF_decompress4X1_usingDTable( -- void* dst, size_t dstSize, -- const void* cSrc, size_t cSrcSize, -- const HUF_DTable* DTable) --{ -- DTableDesc dtd = HUF_getDTableDesc(DTable); -- if (dtd.tableType != 0) return ERROR(GENERIC); -- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); -+ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { -+ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); -+ if (ret != 0) -+ return ret; -+ } -+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); - } - --static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, 
-+static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, -- void* workSpace, size_t wkspSize, int bmi2) -+ void* workSpace, size_t wkspSize, int flags) - { - const BYTE* ip = (const BYTE*) cSrc; - -- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); -+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); - if (HUF_isError(hSize)) return hSize; - if (hSize >= cSrcSize) return ERROR(srcSize_wrong); - ip += hSize; cSrcSize -= hSize; - -- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); --} -- --size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, -- const void* cSrc, size_t cSrcSize, -- void* workSpace, size_t wkspSize) --{ -- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); -+ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); - } - -- - #endif /* HUF_FORCE_DECOMPRESS_X2 */ - - -@@ -985,7 +1120,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 - - static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, - const sortedSymbol_t* sortedList, -- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, -+ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, - const U32 nbBitsBaseline) - { - U32* const rankVal = rankValOrigin[0]; -@@ -1040,14 +1175,7 @@ typedef struct { - - size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, - const void* src, size_t srcSize, -- void* workSpace, size_t wkspSize) --{ -- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); --} -- --size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, -- const void* src, size_t srcSize, -- void* workSpace, size_t wkspSize, int bmi2) -+ void* workSpace, size_t wkspSize, int flags) - { - U32 tableLog, maxW, nbSymbols; - DTableDesc dtd = HUF_getDTableDesc(DTable); -@@ -1069,7 +1197,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, - if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); - /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ - -- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); -+ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); - if (HUF_isError(iSize)) return iSize; - - /* check result */ -@@ -1159,15 +1287,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c - } - - #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ -- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) -+ do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) - --#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ -- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ -- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) -+#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ -+ do { \ -+ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ -+ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ -+ } while (0) - --#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ -- if (MEM_64bits()) \ -- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) -+#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ -+ do { \ -+ if (MEM_64bits()) \ -+ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ -+ } while (0) - - HINT_INLINE size_t - HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, -@@ -1227,7 +1359,7 @@ HUF_decompress1X2_usingDTable_internal_body( - - /* decode */ - { BYTE* const ostart = (BYTE*) dst; -- BYTE* const oend = ostart + dstSize; -+ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); - const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ - const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; - DTableDesc const dtd = HUF_getDTableDesc(DTable); -@@ -1240,6 +1372,11 @@ HUF_decompress1X2_usingDTable_internal_body( - /* decoded size */ - return dstSize; - } -+ -+/* HUF_decompress4X2_usingDTable_internal_body(): -+ * Conditions: -+ * @dstSize >= 6 -+ */ - FORCE_INLINE_TEMPLATE size_t - HUF_decompress4X2_usingDTable_internal_body( - void* dst, size_t dstSize, -@@ -1247,6 +1384,7 @@ HUF_decompress4X2_usingDTable_internal_body( - const HUF_DTable* DTable) - { - if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ -+ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ - - { const BYTE* const istart = (const BYTE*) cSrc; - BYTE* const ostart = (BYTE*) dst; -@@ -1280,8 +1418,9 @@ HUF_decompress4X2_usingDTable_internal_body( - DTableDesc const dtd = HUF_getDTableDesc(DTable); - U32 const dtLog = dtd.tableLog; - -- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ -- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ -+ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ -+ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ -+ assert(dstSize >= 6 /* validated above */); - CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); - CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); - CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); -@@ -1366,44 +1505,191 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo - } - #endif - --#if HUF_NEED_DEFAULT_FUNCTION - static - size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, - size_t cSrcSize, HUF_DTable const* 
DTable) { - return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); - } --#endif - - #if ZSTD_ENABLE_ASM_X86_64_BMI2 - --HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; -+HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; -+ -+#endif -+ -+static HUF_FAST_BMI2_ATTRS -+void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) -+{ -+ U64 bits[4]; -+ BYTE const* ip[4]; -+ BYTE* op[4]; -+ BYTE* oend[4]; -+ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; -+ BYTE const* const ilowest = args->ilowest; -+ -+ /* Copy the arguments to local registers. */ -+ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); -+ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); -+ ZSTD_memcpy(&op, &args->op, sizeof(op)); -+ -+ oend[0] = op[1]; -+ oend[1] = op[2]; -+ oend[2] = op[3]; -+ oend[3] = args->oend; -+ -+ assert(MEM_isLittleEndian()); -+ assert(!MEM_32bits()); -+ -+ for (;;) { -+ BYTE* olimit; -+ int stream; -+ -+ /* Assert loop preconditions */ -+#ifndef NDEBUG -+ for (stream = 0; stream < 4; ++stream) { -+ assert(op[stream] <= oend[stream]); -+ assert(ip[stream] >= ilowest); -+ } -+#endif -+ /* Compute olimit */ -+ { -+ /* Each loop does 5 table lookups for each of the 4 streams. -+ * Each table lookup consumes up to 11 bits of input, and produces -+ * up to 2 bytes of output. -+ */ -+ /* We can consume up to 7 bytes of input per iteration per stream. -+ * We also know that each input pointer is >= ip[0]. So we can run -+ * iters loops before running out of input. -+ */ -+ size_t iters = (size_t)(ip[0] - ilowest) / 7; -+ /* Each iteration can produce up to 10 bytes of output per stream. -+ * Each output stream my advance at different rates. So take the -+ * minimum number of safe iterations among all the output streams. -+ */ -+ for (stream = 0; stream < 4; ++stream) { -+ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; -+ iters = MIN(iters, oiters); -+ } -+ -+ /* Each iteration produces at least 5 output symbols. So until -+ * op[3] crosses olimit, we know we haven't executed iters -+ * iterations yet. This saves us maintaining an iters counter, -+ * at the expense of computing the remaining # of iterations -+ * more frequently. -+ */ -+ olimit = op[3] + (iters * 5); -+ -+ /* Exit the fast decoding loop once we reach the end. */ -+ if (op[3] == olimit) -+ break; -+ -+ /* Exit the decoding loop if any input pointer has crossed the -+ * previous one. This indicates corruption, and a precondition -+ * to our loop is that ip[i] >= ip[0]. 
-+ */ -+ for (stream = 1; stream < 4; ++stream) { -+ if (ip[stream] < ip[stream - 1]) -+ goto _out; -+ } -+ } -+ -+#ifndef NDEBUG -+ for (stream = 1; stream < 4; ++stream) { -+ assert(ip[stream] >= ip[stream - 1]); -+ } -+#endif - --static HUF_ASM_X86_64_BMI2_ATTRS size_t --HUF_decompress4X2_usingDTable_internal_bmi2_asm( -+#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ -+ do { \ -+ if ((_decode3) || (_stream) != 3) { \ -+ int const index = (int)(bits[(_stream)] >> 53); \ -+ HUF_DEltX2 const entry = dtable[index]; \ -+ MEM_write16(op[(_stream)], entry.sequence); \ -+ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ -+ op[(_stream)] += (entry.length); \ -+ } \ -+ } while (0) -+ -+#define HUF_4X2_RELOAD_STREAM(_stream) \ -+ do { \ -+ HUF_4X2_DECODE_SYMBOL(3, 1); \ -+ { \ -+ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ -+ int const nbBits = ctz & 7; \ -+ int const nbBytes = ctz >> 3; \ -+ ip[(_stream)] -= nbBytes; \ -+ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ -+ bits[(_stream)] <<= nbBits; \ -+ } \ -+ } while (0) -+ -+ /* Manually unroll the loop because compilers don't consistently -+ * unroll the inner loops, which destroys performance. -+ */ -+ do { -+ /* Decode 5 symbols from each of the first 3 streams. -+ * The final stream will be decoded during the reload phase -+ * to reduce register pressure. -+ */ -+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); -+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); -+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); -+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); -+ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); -+ -+ /* Decode one symbol from the final stream */ -+ HUF_4X2_DECODE_SYMBOL(3, 1); -+ -+ /* Decode 4 symbols from the final stream & reload bitstreams. -+ * The final stream is reloaded last, meaning that all 5 symbols -+ * are decoded from the final stream before it is reloaded. -+ */ -+ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); -+ } while (op[3] < olimit); -+ } -+ -+#undef HUF_4X2_DECODE_SYMBOL -+#undef HUF_4X2_RELOAD_STREAM -+ -+_out: -+ -+ /* Save the final values of each of the state variables back to args. 
*/ -+ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); -+ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); -+ ZSTD_memcpy(&args->op, &op, sizeof(op)); -+} -+ -+ -+static HUF_FAST_BMI2_ATTRS size_t -+HUF_decompress4X2_usingDTable_internal_fast( - void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, -- const HUF_DTable* DTable) { -+ const HUF_DTable* DTable, -+ HUF_DecompressFastLoopFn loopFn) { - void const* dt = DTable + 1; -- const BYTE* const iend = (const BYTE*)cSrc + 6; -- BYTE* const oend = (BYTE*)dst + dstSize; -- HUF_DecompressAsmArgs args; -+ const BYTE* const ilowest = (const BYTE*)cSrc; -+ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); -+ HUF_DecompressFastArgs args; - { -- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); -+ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); - FORWARD_IF_ERROR(ret, "Failed to init asm args"); -- if (ret != 0) -- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); -+ if (ret == 0) -+ return 0; - } - -- assert(args.ip[0] >= args.ilimit); -- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); -+ assert(args.ip[0] >= args.ilowest); -+ loopFn(&args); - - /* note : op4 already verified within main loop */ -- assert(args.ip[0] >= iend); -- assert(args.ip[1] >= iend); -- assert(args.ip[2] >= iend); -- assert(args.ip[3] >= iend); -+ assert(args.ip[0] >= ilowest); -+ assert(args.ip[1] >= ilowest); -+ assert(args.ip[2] >= ilowest); -+ assert(args.ip[3] >= ilowest); - assert(args.op[3] <= oend); -- (void)iend; -+ -+ assert(ilowest == args.ilowest); -+ assert(ilowest + 6 == args.iend[0]); -+ (void)ilowest; - - /* finish bitStreams one by one */ - { -@@ -1426,91 +1712,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( - /* decoded size */ - return dstSize; - } --#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ - - static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, -- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) -+ size_t cSrcSize, HUF_DTable const* DTable, int flags) - { -+ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; -+ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; -+ - #if DYNAMIC_BMI2 -- if (bmi2) { -+ if (flags & HUF_flags_bmi2) { -+ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; - # if ZSTD_ENABLE_ASM_X86_64_BMI2 -- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); --# else -- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); -+ if (!(flags & HUF_flags_disableAsm)) { -+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; -+ } - # endif -+ } else { -+ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); - } --#else -- (void)bmi2; - #endif - - #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) -- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); --#else -- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); -+ if (!(flags & HUF_flags_disableAsm)) { -+ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; -+ } - #endif -+ -+ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { -+ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); -+ if (ret != 0) -+ return ret; -+ } -+ return fallbackFn(dst, 
dstSize, cSrc, cSrcSize, DTable); - } - - HUF_DGEN(HUF_decompress1X2_usingDTable_internal) - --size_t HUF_decompress1X2_usingDTable( -- void* dst, size_t dstSize, -- const void* cSrc, size_t cSrcSize, -- const HUF_DTable* DTable) --{ -- DTableDesc dtd = HUF_getDTableDesc(DTable); -- if (dtd.tableType != 1) return ERROR(GENERIC); -- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); --} -- - size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, -- void* workSpace, size_t wkspSize) -+ void* workSpace, size_t wkspSize, int flags) - { - const BYTE* ip = (const BYTE*) cSrc; - - size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, -- workSpace, wkspSize); -+ workSpace, wkspSize, flags); - if (HUF_isError(hSize)) return hSize; - if (hSize >= cSrcSize) return ERROR(srcSize_wrong); - ip += hSize; cSrcSize -= hSize; - -- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); -+ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); - } - -- --size_t HUF_decompress4X2_usingDTable( -- void* dst, size_t dstSize, -- const void* cSrc, size_t cSrcSize, -- const HUF_DTable* DTable) --{ -- DTableDesc dtd = HUF_getDTableDesc(DTable); -- if (dtd.tableType != 1) return ERROR(GENERIC); -- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); --} -- --static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, -+static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, -- void* workSpace, size_t wkspSize, int bmi2) -+ void* workSpace, size_t wkspSize, int flags) - { - const BYTE* ip = (const BYTE*) cSrc; - - size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, -- workSpace, wkspSize); -+ workSpace, wkspSize, flags); - if (HUF_isError(hSize)) return hSize; - if (hSize >= cSrcSize) return ERROR(srcSize_wrong); - ip += hSize; cSrcSize -= hSize; - -- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); -+ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); - } - --size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, -- const void* cSrc, size_t cSrcSize, -- void* workSpace, size_t wkspSize) --{ -- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); --} -- -- - #endif /* HUF_FORCE_DECOMPRESS_X1 */ - - -@@ -1518,44 +1785,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, - /* Universal decompression selectors */ - /* ***********************************/ - --size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, -- const void* cSrc, size_t cSrcSize, -- const HUF_DTable* DTable) --{ -- DTableDesc const dtd = HUF_getDTableDesc(DTable); --#if defined(HUF_FORCE_DECOMPRESS_X1) -- (void)dtd; -- assert(dtd.tableType == 0); -- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); --#elif defined(HUF_FORCE_DECOMPRESS_X2) -- (void)dtd; -- assert(dtd.tableType == 1); -- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); --#else -- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : -- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); --#endif --} -- --size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, -- const void* cSrc, size_t cSrcSize, -- const HUF_DTable* DTable) --{ -- DTableDesc const dtd = HUF_getDTableDesc(DTable); --#if defined(HUF_FORCE_DECOMPRESS_X1) -- (void)dtd; -- assert(dtd.tableType == 0); -- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); --#elif defined(HUF_FORCE_DECOMPRESS_X2) -- (void)dtd; -- assert(dtd.tableType == 1); -- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); --#else -- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : -- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); --#endif --} -- - - #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) - typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; -@@ -1610,36 +1839,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) - #endif - } - -- --size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, -- size_t dstSize, const void* cSrc, -- size_t cSrcSize, void* workSpace, -- size_t wkspSize) --{ -- /* validation checks */ -- if (dstSize == 0) return ERROR(dstSize_tooSmall); -- if (cSrcSize == 0) return ERROR(corruption_detected); -- -- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); --#if defined(HUF_FORCE_DECOMPRESS_X1) -- (void)algoNb; -- assert(algoNb == 0); -- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); --#elif defined(HUF_FORCE_DECOMPRESS_X2) -- (void)algoNb; -- assert(algoNb == 1); -- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); --#else -- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, -- cSrcSize, workSpace, wkspSize): -- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); --#endif -- } --} -- - size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, - const void* cSrc, size_t cSrcSize, -- void* workSpace, size_t wkspSize) -+ void* workSpace, size_t wkspSize, int flags) - { - /* validation checks */ - if (dstSize == 0) return ERROR(dstSize_tooSmall); -@@ -1652,71 +1854,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, - (void)algoNb; - assert(algoNb == 0); - return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, -- cSrcSize, workSpace, wkspSize); -+ cSrcSize, workSpace, wkspSize, flags); - #elif defined(HUF_FORCE_DECOMPRESS_X2) - (void)algoNb; - assert(algoNb == 1); - return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, -- cSrcSize, workSpace, wkspSize); -+ cSrcSize, workSpace, wkspSize, flags); - #else - return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, -- cSrcSize, workSpace, wkspSize): -+ cSrcSize, workSpace, wkspSize, flags): - HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, -- cSrcSize, workSpace, wkspSize); -+ cSrcSize, workSpace, wkspSize, flags); - #endif - } - } - - --size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) -+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) - { - DTableDesc const dtd = HUF_getDTableDesc(DTable); - #if defined(HUF_FORCE_DECOMPRESS_X1) - (void)dtd; - assert(dtd.tableType == 0); -- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); -+ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); - #elif defined(HUF_FORCE_DECOMPRESS_X2) - (void)dtd; - assert(dtd.tableType == 1); -- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); -+ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); - #else -- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : -- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); -+ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : -+ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); - #endif - } - - #ifndef HUF_FORCE_DECOMPRESS_X2 --size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) -+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) - { - const BYTE* ip = (const BYTE*) cSrc; - -- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); -+ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); - if (HUF_isError(hSize)) return hSize; - if (hSize >= cSrcSize) return ERROR(srcSize_wrong); - ip += hSize; cSrcSize -= hSize; - -- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); -+ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); - } - #endif - --size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) -+size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) - { - DTableDesc const dtd = HUF_getDTableDesc(DTable); - #if defined(HUF_FORCE_DECOMPRESS_X1) - (void)dtd; - assert(dtd.tableType == 0); -- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); -+ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); - #elif defined(HUF_FORCE_DECOMPRESS_X2) - (void)dtd; - assert(dtd.tableType == 1); -- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); -+ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); - #else -- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : -- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); -+ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : -+ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); - #endif - } - --size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) -+size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) - { - /* validation checks */ - if (dstSize == 0) return ERROR(dstSize_tooSmall); -@@ -1726,15 +1928,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds - #if defined(HUF_FORCE_DECOMPRESS_X1) - (void)algoNb; - assert(algoNb == 0); -- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); -+ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); - #elif defined(HUF_FORCE_DECOMPRESS_X2) - (void)algoNb; - assert(algoNb == 1); -- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); -+ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); - #else -- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : -- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); -+ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : -+ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); - #endif - } - } -- -diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c -index dbbc7919de53..30ef65e1ab5c 100644 ---- a/lib/zstd/decompress/zstd_ddict.c -+++ b/lib/zstd/decompress/zstd_ddict.c -@@ -1,5 +1,6 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -14,12 +15,12 @@ - /*-******************************************************* - * Dependencies - *********************************************************/ -+#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ - #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ - #include "../common/cpu.h" /* bmi2 */ - #include "../common/mem.h" /* low level memory routines */ - #define FSE_STATIC_LINKING_ONLY - #include "../common/fse.h" --#define HUF_STATIC_LINKING_ONLY - #include "../common/huf.h" - #include "zstd_decompress_internal.h" - #include "zstd_ddict.h" -@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, - ZSTD_memcpy(internalBuffer, dict, dictSize); - } - ddict->dictSize = dictSize; -- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ -+ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ - - /* parse dictionary content */ - FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); -@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) - unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) - { - if (ddict==NULL) return 0; -- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); -+ return ddict->dictID; - } -diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h -index 8c1a79d666f8..de459a0dacd1 100644 ---- a/lib/zstd/decompress/zstd_ddict.h -+++ b/lib/zstd/decompress/zstd_ddict.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c -index 6b3177c94711..c9cbc45f6ed9 100644 ---- a/lib/zstd/decompress/zstd_decompress.c -+++ b/lib/zstd/decompress/zstd_decompress.c -@@ -1,5 +1,6 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -53,13 +54,15 @@ - * Dependencies - *********************************************************/ - #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ -+#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ -+#include "../common/error_private.h" -+#include "../common/zstd_internal.h" /* blockProperties_t */ - #include "../common/mem.h" /* low level memory routines */ -+#include "../common/bits.h" /* ZSTD_highbit32 */ - #define FSE_STATIC_LINKING_ONLY - #include "../common/fse.h" --#define HUF_STATIC_LINKING_ONLY - #include "../common/huf.h" - #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ --#include "../common/zstd_internal.h" /* blockProperties_t */ - #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ - #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ - #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ -@@ -72,11 +75,11 @@ - *************************************/ - - #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 --#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. -- * Currently, that means a 0.75 load factor. -- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded -- * the load factor of the ddict hash set. -- */ -+#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. -+ * Currently, that means a 0.75 load factor. -+ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded -+ * the load factor of the ddict hash set. -+ */ - - #define DDICT_HASHSET_TABLE_BASE_SIZE 64 - #define DDICT_HASHSET_RESIZE_FACTOR 2 -@@ -237,6 +240,8 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) - dctx->outBufferMode = ZSTD_bm_buffered; - dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; - dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; -+ dctx->disableHufAsm = 0; -+ dctx->maxBlockSizeParam = 0; - } - - static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) -@@ -253,6 +258,7 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) - dctx->streamStage = zdss_init; - dctx->noForwardProgress = 0; - dctx->oversizedDuration = 0; -+ dctx->isFrameDecompression = 1; - #if DYNAMIC_BMI2 - dctx->bmi2 = ZSTD_cpuSupportsBmi2(); - #endif -@@ -421,16 +427,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) - * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless - * @return : 0, `zfhPtr` is correctly filled, - * >0, `srcSize` is too small, value is wanted `srcSize` amount, -- * or an error code, which can be tested using ZSTD_isError() */ -+** or an error code, which can be tested using ZSTD_isError() */ - size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) - { - const BYTE* ip = (const BYTE*)src; - size_t const minInputSize = ZSTD_startingInputLength(format); - -- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ -- if (srcSize < minInputSize) return minInputSize; -- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); -+ DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); -+ -+ if (srcSize > 0) { -+ /* 
note : technically could be considered an assert(), since it's an invalid entry */ -+ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); -+ } -+ if (srcSize < minInputSize) { -+ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { -+ /* when receiving less than @minInputSize bytes, -+ * control these bytes at least correspond to a supported magic number -+ * in order to error out early if they don't. -+ **/ -+ size_t const toCopy = MIN(4, srcSize); -+ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); -+ assert(src != NULL); -+ ZSTD_memcpy(hbuf, src, toCopy); -+ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { -+ /* not a zstd frame : let's check if it's a skippable frame */ -+ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); -+ ZSTD_memcpy(hbuf, src, toCopy); -+ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { -+ RETURN_ERROR(prefix_unknown, -+ "first bytes don't correspond to any supported magic number"); -+ } } } -+ return minInputSize; -+ } - -+ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ - if ( (format != ZSTD_f_zstd1_magicless) - && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { - if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { -@@ -540,61 +570,62 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) - sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); - RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, - frameParameter_unsupported, ""); -- { -- size_t const skippableSize = skippableHeaderSize + sizeU32; -+ { size_t const skippableSize = skippableHeaderSize + sizeU32; - RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); - return skippableSize; - } - } - - /*! ZSTD_readSkippableFrame() : -- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. -+ * Retrieves content of a skippable frame, and writes it to dst buffer. - * - * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, - * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested - * in the magicVariant. - * -- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. -+ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. - * - * @return : number of bytes written or a ZSTD error. 
- */ --ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, -- const void* src, size_t srcSize) -+size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, -+ unsigned* magicVariant, /* optional, can be NULL */ -+ const void* src, size_t srcSize) - { -- U32 const magicNumber = MEM_readLE32(src); -- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); -- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; -- -- /* check input validity */ -- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); -- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); -- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); -+ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); - -- /* deliver payload */ -- if (skippableContentSize > 0 && dst != NULL) -- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); -- if (magicVariant != NULL) -- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; -- return skippableContentSize; -+ { U32 const magicNumber = MEM_readLE32(src); -+ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); -+ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; -+ -+ /* check input validity */ -+ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); -+ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); -+ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); -+ -+ /* deliver payload */ -+ if (skippableContentSize > 0 && dst != NULL) -+ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); -+ if (magicVariant != NULL) -+ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; -+ return skippableContentSize; -+ } - } - - /* ZSTD_findDecompressedSize() : -- * compatible with legacy mode - * `srcSize` must be the exact length of some number of ZSTD compressed and/or - * skippable frames -- * @return : decompressed size of the frames contained */ -+ * note: compatible with legacy mode -+ * @return : decompressed size of the frames contained */ - unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) - { -- unsigned long long totalDstSize = 0; -+ U64 totalDstSize = 0; - - while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { - U32 const magicNumber = MEM_readLE32(src); - - if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { - size_t const skippableSize = readSkippableFrameSize(src, srcSize); -- if (ZSTD_isError(skippableSize)) { -- return ZSTD_CONTENTSIZE_ERROR; -- } -+ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; - assert(skippableSize <= srcSize); - - src = (const BYTE *)src + skippableSize; -@@ -602,17 +633,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) - continue; - } - -- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); -- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; -+ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); -+ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; - -- /* check for overflow */ -- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; -- totalDstSize += ret; -+ if (U64_MAX - totalDstSize < fcs) -+ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ -+ 
totalDstSize += fcs; - } -+ /* skip to next frame */ - { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); -- if (ZSTD_isError(frameSrcSize)) { -- return ZSTD_CONTENTSIZE_ERROR; -- } -+ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; -+ assert(frameSrcSize <= srcSize); - - src = (const BYTE *)src + frameSrcSize; - srcSize -= frameSrcSize; -@@ -676,13 +707,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) - return frameSizeInfo; - } - --static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) -+static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format) - { - ZSTD_frameSizeInfo frameSizeInfo; - ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); - - -- if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) -+ if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE) - && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { - frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); - assert(ZSTD_isError(frameSizeInfo.compressedSize) || -@@ -696,7 +727,7 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize - ZSTD_frameHeader zfh; - - /* Extract Frame Header */ -- { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); -+ { size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format); - if (ZSTD_isError(ret)) - return ZSTD_errorFrameSizeInfo(ret); - if (ret > 0) -@@ -730,23 +761,26 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize - ip += 4; - } - -+ frameSizeInfo.nbBlocks = nbBlocks; - frameSizeInfo.compressedSize = (size_t)(ip - ipstart); - frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) - ? 
zfh.frameContentSize -- : nbBlocks * zfh.blockSizeMax; -+ : (unsigned long long)nbBlocks * zfh.blockSizeMax; - return frameSizeInfo; - } - } - -+static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) { -+ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format); -+ return frameSizeInfo.compressedSize; -+} -+ - /* ZSTD_findFrameCompressedSize() : -- * compatible with legacy mode -- * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame -- * `srcSize` must be at least as large as the frame contained -- * @return : the compressed size of the frame starting at `src` */ -+ * See docs in zstd.h -+ * Note: compatible with legacy mode */ - size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) - { -- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); -- return frameSizeInfo.compressedSize; -+ return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1); - } - - /* ZSTD_decompressBound() : -@@ -760,7 +794,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) - unsigned long long bound = 0; - /* Iterate over each frame */ - while (srcSize > 0) { -- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); -+ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); - size_t const compressedSize = frameSizeInfo.compressedSize; - unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; - if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) -@@ -773,6 +807,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) - return bound; - } - -+size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) -+{ -+ size_t margin = 0; -+ unsigned maxBlockSize = 0; -+ -+ /* Iterate over each frame */ -+ while (srcSize > 0) { -+ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); -+ size_t const compressedSize = frameSizeInfo.compressedSize; -+ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; -+ ZSTD_frameHeader zfh; -+ -+ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); -+ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) -+ return ERROR(corruption_detected); -+ -+ if (zfh.frameType == ZSTD_frame) { -+ /* Add the frame header to our margin */ -+ margin += zfh.headerSize; -+ /* Add the checksum to our margin */ -+ margin += zfh.checksumFlag ? 4 : 0; -+ /* Add 3 bytes per block */ -+ margin += 3 * frameSizeInfo.nbBlocks; -+ -+ /* Compute the max block size */ -+ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); -+ } else { -+ assert(zfh.frameType == ZSTD_skippableFrame); -+ /* Add the entire skippable frame size to our margin. */ -+ margin += compressedSize; -+ } -+ -+ assert(srcSize >= compressedSize); -+ src = (const BYTE*)src + compressedSize; -+ srcSize -= compressedSize; -+ } -+ -+ /* Add the max block size back to the margin. 
*/ -+ margin += maxBlockSize; -+ -+ return margin; -+} - - /*-************************************************************* - * Frame decoding -@@ -856,6 +932,10 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, - ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; - } - -+ /* Shrink the blockSizeMax if enabled */ -+ if (dctx->maxBlockSizeParam != 0) -+ dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam); -+ - /* Loop on each block */ - while (1) { - BYTE* oBlockEnd = oend; -@@ -888,7 +968,8 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, - switch(blockProperties.blockType) - { - case bt_compressed: -- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); -+ assert(dctx->isFrameDecompression == 1); -+ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming); - break; - case bt_raw : - /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */ -@@ -901,12 +982,14 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, - default: - RETURN_ERROR(corruption_detected, "invalid block type"); - } -- -- if (ZSTD_isError(decodedSize)) return decodedSize; -- if (dctx->validateChecksum) -+ FORWARD_IF_ERROR(decodedSize, "Block decompression failure"); -+ DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize); -+ if (dctx->validateChecksum) { - xxh64_update(&dctx->xxhState, op, decodedSize); -- if (decodedSize != 0) -+ } -+ if (decodedSize) /* support dst = NULL,0 */ { - op += decodedSize; -+ } - assert(ip != NULL); - ip += cBlockSize; - remainingSrcSize -= cBlockSize; -@@ -930,12 +1013,15 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, - } - ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); - /* Allow caller to get size read */ -+ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr); - *srcPtr = ip; - *srcSizePtr = remainingSrcSize; - return (size_t)(op-ostart); - } - --static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, -+static -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR -+size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, - const void* dict, size_t dictSize, -@@ -955,17 +1041,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, - while (srcSize >= ZSTD_startingInputLength(dctx->format)) { - - -- { U32 const magicNumber = MEM_readLE32(src); -- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", -- (unsigned)magicNumber, ZSTD_MAGICNUMBER); -+ if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) { -+ U32 const magicNumber = MEM_readLE32(src); -+ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); - if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { -+ /* skippable frame detected : skip it */ - size_t const skippableSize = readSkippableFrameSize(src, srcSize); -- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); -+ FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); - assert(skippableSize <= srcSize); - - src = (const BYTE *)src + skippableSize; - srcSize -= skippableSize; -- continue; -+ continue; /* check next frame */ - } } - - if (ddict) { -@@ -1061,8 +1148,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr - size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return 
dctx->expected; } - - /* -- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, -- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can -+ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we -+ * allow taking a partial block as the input. Currently only raw uncompressed blocks can - * be streamed. - * - * For blocks that can be streamed, this allows us to reduce the latency until we produce -@@ -1181,7 +1268,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c - { - case bt_compressed: - DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); -- rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); -+ assert(dctx->isFrameDecompression == 1); -+ rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming); - dctx->expected = 0; /* Streaming not supported */ - break; - case bt_raw : -@@ -1250,6 +1338,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c - case ZSTDds_decodeSkippableHeader: - assert(src != NULL); - assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); -+ assert(dctx->format != ZSTD_f_zstd1_magicless); - ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ - dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ - dctx->stage = ZSTDds_skipFrame; -@@ -1262,7 +1351,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c - - default: - assert(0); /* impossible */ -- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ -+ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ - } - } - -@@ -1303,11 +1392,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, - /* in minimal huffman, we always use X1 variants */ - size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, - dictPtr, dictEnd - dictPtr, -- workspace, workspaceSize); -+ workspace, workspaceSize, /* flags */ 0); - #else - size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, - dictPtr, (size_t)(dictEnd - dictPtr), -- workspace, workspaceSize); -+ workspace, workspaceSize, /* flags */ 0); - #endif - RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); - dictPtr += hSize; -@@ -1403,10 +1492,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) - dctx->prefixStart = NULL; - dctx->virtualStart = NULL; - dctx->dictEnd = NULL; -- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ -+ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ - dctx->litEntropy = dctx->fseEntropy = 0; - dctx->dictID = 0; - dctx->bType = bt_reserved; -+ dctx->isFrameDecompression = 1; - ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); - ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ - dctx->LLTptr = dctx->entropy.LLTable; -@@ -1465,7 +1555,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) - * This could for one of the following reasons : - * - The frame does not require a dictionary (most common case). - * - The frame was built with dictID intentionally removed. 
-- * Needed dictionary is a hidden information. -+ * Needed dictionary is a hidden piece of information. - * Note : this use case also happens when using a non-conformant dictionary. - * - `srcSize` is too small, and as a result, frame header could not be decoded. - * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. -@@ -1474,7 +1564,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) - * ZSTD_getFrameHeader(), which will provide a more precise error code. */ - unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) - { -- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; -+ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; - size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); - if (ZSTD_isError(hError)) return 0; - return zfp.dictID; -@@ -1581,7 +1671,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di - size_t ZSTD_initDStream(ZSTD_DStream* zds) - { - DEBUGLOG(4, "ZSTD_initDStream"); -- return ZSTD_initDStream_usingDDict(zds, NULL); -+ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); -+ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); -+ return ZSTD_startingInputLength(zds->format); - } - - /* ZSTD_initDStream_usingDDict() : -@@ -1589,6 +1681,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) - * this function cannot fail */ - size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) - { -+ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); - FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); - FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); - return ZSTD_startingInputLength(dctx->format); -@@ -1599,6 +1692,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) - * this function cannot fail */ - size_t ZSTD_resetDStream(ZSTD_DStream* dctx) - { -+ DEBUGLOG(4, "ZSTD_resetDStream"); - FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); - return ZSTD_startingInputLength(dctx->format); - } -@@ -1670,6 +1764,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) - bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; - bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; - return bounds; -+ case ZSTD_d_disableHuffmanAssembly: -+ bounds.lowerBound = 0; -+ bounds.upperBound = 1; -+ return bounds; -+ case ZSTD_d_maxBlockSize: -+ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; -+ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; -+ return bounds; -+ - default:; - } - bounds.error = ERROR(parameter_unsupported); -@@ -1710,6 +1813,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value - case ZSTD_d_refMultipleDDicts: - *value = (int)dctx->refMultipleDDicts; - return 0; -+ case ZSTD_d_disableHuffmanAssembly: -+ *value = (int)dctx->disableHufAsm; -+ return 0; -+ case ZSTD_d_maxBlockSize: -+ *value = dctx->maxBlockSizeParam; -+ return 0; - default:; - } - RETURN_ERROR(parameter_unsupported, ""); -@@ -1743,6 +1852,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value - } - dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; - return 0; -+ case ZSTD_d_disableHuffmanAssembly: -+ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); -+ dctx->disableHufAsm = value != 0; -+ return 0; -+ case ZSTD_d_maxBlockSize: -+ if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value); -+ dctx->maxBlockSizeParam = value; -+ return 0; - default:; - } - RETURN_ERROR(parameter_unsupported, ""); -@@ -1754,6 +1871,7 @@ size_t 
ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) - || (reset == ZSTD_reset_session_and_parameters) ) { - dctx->streamStage = zdss_init; - dctx->noForwardProgress = 0; -+ dctx->isFrameDecompression = 1; - } - if ( (reset == ZSTD_reset_parameters) - || (reset == ZSTD_reset_session_and_parameters) ) { -@@ -1770,11 +1888,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) - return ZSTD_sizeof_DCtx(dctx); - } - --size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) -+static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax) - { -- size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); -- /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ -- unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); -+ size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax); -+ /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block -+ * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing -+ * the block at the beginning of the output buffer, and maintain a full window. -+ * -+ * We need another blockSize worth of buffer so that we can store split -+ * literals at the end of the block without overwriting the extDict window. -+ */ -+ unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2); - unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); - size_t const minRBSize = (size_t) neededSize; - RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, -@@ -1782,6 +1906,11 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long - return minRBSize; - } - -+size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) -+{ -+ return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX); -+} -+ - size_t ZSTD_estimateDStreamSize(size_t windowSize) - { - size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); -@@ -1918,7 +2047,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB - if (zds->refMultipleDDicts && zds->ddictSet) { - ZSTD_DCtx_selectFrameDDict(zds); - } -- DEBUGLOG(5, "header size : %u", (U32)hSize); - if (ZSTD_isError(hSize)) { - return hSize; /* error */ - } -@@ -1932,6 +2060,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB - zds->lhSize += remainingInput; - } - input->pos = input->size; -+ /* check first few bytes */ -+ FORWARD_IF_ERROR( -+ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), -+ "First few bytes detected incorrect" ); -+ /* return hint input size */ - return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ - } - assert(ip != NULL); -@@ -1943,14 +2076,15 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB - if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN - && zds->fParams.frameType != ZSTD_skippableFrame - && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { -- size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart)); -+ size_t const cSize 
= ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format); - if (cSize <= (size_t)(iend-istart)) { - /* shortcut : using single-pass mode */ - size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); - if (ZSTD_isError(decompressedSize)) return decompressedSize; -- DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") -+ DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()"); -+ assert(istart != NULL); - ip = istart + cSize; -- op += decompressedSize; -+ op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ - zds->expected = 0; - zds->streamStage = zdss_init; - someMoreWork = 0; -@@ -1969,7 +2103,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB - DEBUGLOG(4, "Consume header"); - FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); - -- if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ -+ if (zds->format == ZSTD_f_zstd1 -+ && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ - zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); - zds->stage = ZSTDds_skipFrame; - } else { -@@ -1985,11 +2120,13 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB - zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); - RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, - frameParameter_windowTooLarge, ""); -+ if (zds->maxBlockSizeParam != 0) -+ zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam); - - /* Adapt buffer sizes to frame header instructions */ - { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); - size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered -- ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) -+ ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax) - : 0; - - ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); -@@ -2034,6 +2171,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB - } - if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ - FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); -+ assert(ip != NULL); - ip += neededInSize; - /* Function modifies the stage so we must break */ - break; -@@ -2048,7 +2186,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB - int const isSkipFrame = ZSTD_isSkipFrame(zds); - size_t loadedSize; - /* At this point we shouldn't be decompressing a block that we can stream. 
*/ -- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); -+ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); - if (isSkipFrame) { - loadedSize = MIN(toLoad, (size_t)(iend-ip)); - } else { -@@ -2057,8 +2195,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB - "should never happen"); - loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); - } -- ip += loadedSize; -- zds->inPos += loadedSize; -+ if (loadedSize != 0) { -+ /* ip may be NULL */ -+ ip += loadedSize; -+ zds->inPos += loadedSize; -+ } - if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ - - /* decode loaded input */ -@@ -2068,14 +2209,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB - break; - } - case zdss_flush: -- { size_t const toFlushSize = zds->outEnd - zds->outStart; -+ { -+ size_t const toFlushSize = zds->outEnd - zds->outStart; - size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); -- op += flushedSize; -+ -+ op = op ? op + flushedSize : op; -+ - zds->outStart += flushedSize; - if (flushedSize == toFlushSize) { /* flush completed */ - zds->streamStage = zdss_read; - if ( (zds->outBuffSize < zds->fParams.frameContentSize) -- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { -+ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { - DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", - (int)(zds->outBuffSize - zds->outStart), - (U32)zds->fParams.blockSizeMax); -@@ -2089,7 +2233,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB - - default: - assert(0); /* impossible */ -- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ -+ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ - } } - - /* result */ -@@ -2102,8 +2246,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB - if ((ip==istart) && (op==ostart)) { /* no forward progress */ - zds->noForwardProgress ++; - if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { -- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); -- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); -+ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); -+ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); - assert(0); - } - } else { -@@ -2140,11 +2284,17 @@ size_t ZSTD_decompressStream_simpleArgs ( - void* dst, size_t dstCapacity, size_t* dstPos, - const void* src, size_t srcSize, size_t* srcPos) - { -- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; -- ZSTD_inBuffer input = { src, srcSize, *srcPos }; -- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ -- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); -- *dstPos = output.pos; -- *srcPos = input.pos; -- return cErr; -+ ZSTD_outBuffer output; -+ ZSTD_inBuffer input; -+ output.dst = dst; -+ output.size = dstCapacity; -+ output.pos = *dstPos; -+ input.src = src; -+ input.size = srcSize; -+ input.pos = *srcPos; -+ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); -+ *dstPos = output.pos; -+ *srcPos = input.pos; -+ return cErr; -+ } - } -diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c -index c1913b8e7c89..9fe9a12c8a2c 100644 ---- 
a/lib/zstd/decompress/zstd_decompress_block.c -+++ b/lib/zstd/decompress/zstd_decompress_block.c -@@ -1,5 +1,6 @@ -+// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -20,12 +21,12 @@ - #include "../common/mem.h" /* low level memory routines */ - #define FSE_STATIC_LINKING_ONLY - #include "../common/fse.h" --#define HUF_STATIC_LINKING_ONLY - #include "../common/huf.h" - #include "../common/zstd_internal.h" - #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ - #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ - #include "zstd_decompress_block.h" -+#include "../common/bits.h" /* ZSTD_highbit32 */ - - /*_******************************************************* - * Macros -@@ -51,6 +52,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } - * Block decoding - ***************************************************************/ - -+static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) -+{ -+ size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; -+ assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); -+ return blockSizeMax; -+} -+ - /*! ZSTD_getcBlockSize() : - * Provides the size of compressed block from block header `src` */ - size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, -@@ -73,41 +81,49 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, - static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, - const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) - { -- if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) -- { -- /* room for litbuffer to fit without read faulting */ -- dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; -+ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); -+ assert(litSize <= blockSizeMax); -+ assert(dctx->isFrameDecompression || streaming == not_streaming); -+ assert(expectedWriteSize <= blockSizeMax); -+ if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { -+ /* If we aren't streaming, we can just put the literals after the output -+ * of the current block. We don't need to worry about overwriting the -+ * extDict of our window, because it doesn't exist. -+ * So if we have space after the end of the block, just put it there. -+ */ -+ dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; - dctx->litBufferEnd = dctx->litBuffer + litSize; - dctx->litBufferLocation = ZSTD_in_dst; -- } -- else if (litSize > ZSTD_LITBUFFEREXTRASIZE) -- { -- /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ -+ } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { -+ /* Literals fit entirely within the extra buffer, put them there to avoid -+ * having to split the literals. -+ */ -+ dctx->litBuffer = dctx->litExtraBuffer; -+ dctx->litBufferEnd = dctx->litBuffer + litSize; -+ dctx->litBufferLocation = ZSTD_not_in_dst; -+ } else { -+ assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); -+ /* Literals must be split between the output block and the extra lit -+ * buffer. 
We fill the extra lit buffer with the tail of the literals, -+ * and put the rest of the literals at the end of the block, with -+ * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. -+ * This MUST not write more than our maxBlockSize beyond dst, because in -+ * streaming mode, that could overwrite part of our extDict window. -+ */ - if (splitImmediately) { - /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ - dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; - dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; -- } -- else { -- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ -+ } else { -+ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ - dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; - dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; - } - dctx->litBufferLocation = ZSTD_split; -- } -- else -- { -- /* fits entirely within litExtraBuffer, so no split is necessary */ -- dctx->litBuffer = dctx->litExtraBuffer; -- dctx->litBufferEnd = dctx->litBuffer + litSize; -- dctx->litBufferLocation = ZSTD_not_in_dst; -+ assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); - } - } - --/* Hidden declaration for fullbench */ --size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, -- const void* src, size_t srcSize, -- void* dst, size_t dstCapacity, const streaming_operation streaming); - /*! ZSTD_decodeLiteralsBlock() : - * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored - * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current -@@ -116,7 +132,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - * - * @return : nb of bytes read from src (< srcSize ) - * note : symbol not declared but exposed for fullbench */ --size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, -+static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ - void* dst, size_t dstCapacity, const streaming_operation streaming) - { -@@ -125,6 +141,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - - { const BYTE* const istart = (const BYTE*) src; - symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); -+ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); - - switch(litEncType) - { -@@ -134,13 +151,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - ZSTD_FALLTHROUGH; - - case set_compressed: -- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); -+ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); - { size_t lhSize, litSize, litCSize; - U32 singleStream=0; - U32 const lhlCode = (istart[0] >> 2) & 3; - U32 const lhc = MEM_readLE32(istart); - size_t hufSuccess; -- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); -+ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); -+ int const flags = 0 -+ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) -+ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); - switch(lhlCode) - { - case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ -@@ -164,7 +184,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - break; - } - RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); -- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); -+ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); -+ if (!singleStream) -+ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, -+ "Not enough literals (%zu) for the 4-streams mode (min %u)", -+ litSize, MIN_LITERALS_FOR_4_STREAMS); - RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); - RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); - ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); -@@ -176,13 +200,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - - if (litEncType==set_repeat) { - if (singleStream) { -- hufSuccess = HUF_decompress1X_usingDTable_bmi2( -+ hufSuccess = HUF_decompress1X_usingDTable( - dctx->litBuffer, litSize, istart+lhSize, litCSize, -- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); -+ dctx->HUFptr, flags); - } else { -- hufSuccess = HUF_decompress4X_usingDTable_bmi2( -+ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); -+ hufSuccess = HUF_decompress4X_usingDTable( - dctx->litBuffer, litSize, istart+lhSize, litCSize, -- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); -+ dctx->HUFptr, flags); - } - } else { - if (singleStream) { -@@ -190,26 +215,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - hufSuccess = HUF_decompress1X_DCtx_wksp( - dctx->entropy.hufTable, dctx->litBuffer, litSize, - istart+lhSize, litCSize, dctx->workspace, -- sizeof(dctx->workspace)); -+ sizeof(dctx->workspace), flags); - #else -- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( -+ hufSuccess = HUF_decompress1X1_DCtx_wksp( - dctx->entropy.hufTable, dctx->litBuffer, litSize, - istart+lhSize, litCSize, dctx->workspace, -- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); -+ sizeof(dctx->workspace), flags); - #endif - } else { -- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( -+ hufSuccess = HUF_decompress4X_hufOnly_wksp( - dctx->entropy.hufTable, dctx->litBuffer, litSize, - istart+lhSize, litCSize, dctx->workspace, -- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); -+ sizeof(dctx->workspace), flags); - } - } - if (dctx->litBufferLocation == ZSTD_split) - { -+ assert(litSize > ZSTD_LITBUFFEREXTRASIZE); - ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); - ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); - dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; - dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; -+ assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); - } - - RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); -@@ -224,7 +251,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - case set_basic: - { size_t litSize, lhSize; - U32 const lhlCode = ((istart[0]) >> 2) & 3; -- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); -+ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); - switch(lhlCode) - { - case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ -@@ -237,11 +264,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - break; - case 3: - lhSize = 3; -+ 
RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); - litSize = MEM_readLE24(istart) >> 4; - break; - } - - RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); -+ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); - RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); - ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); - if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ -@@ -270,7 +299,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - case set_rle: - { U32 const lhlCode = ((istart[0]) >> 2) & 3; - size_t litSize, lhSize; -- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); -+ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); - switch(lhlCode) - { - case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ -@@ -279,16 +308,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - break; - case 1: - lhSize = 2; -+ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); - litSize = MEM_readLE16(istart) >> 4; - break; - case 3: - lhSize = 3; -+ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); - litSize = MEM_readLE24(istart) >> 4; -- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); - break; - } - RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); -- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); -+ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); - RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); - ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); - if (dctx->litBufferLocation == ZSTD_split) -@@ -310,6 +340,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - } - } - -+/* Hidden declaration for fullbench */ -+size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, -+ const void* src, size_t srcSize, -+ void* dst, size_t dstCapacity); -+size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, -+ const void* src, size_t srcSize, -+ void* dst, size_t dstCapacity) -+{ -+ dctx->isFrameDecompression = 0; -+ return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); -+} -+ - /* Default FSE distribution tables. - * These are pre-calculated FSE decoding tables using default distributions as defined in specification : - * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions -@@ -506,14 +548,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, - for (i = 8; i < n; i += 8) { - MEM_write64(spread + pos + i, sv); - } -- pos += n; -+ assert(n>=0); -+ pos += (size_t)n; - } - } - /* Now we spread those positions across the table. -- * The benefit of doing it in two stages is that we avoid the the -+ * The benefit of doing it in two stages is that we avoid the - * variable size inner loop, which caused lots of branch misses. - * Now we can run through all the positions without any branch misses. -- * We unroll the loop twice, since that is what emperically worked best. -+ * We unroll the loop twice, since that is what empirically worked best. 
- */ - { - size_t position = 0; -@@ -540,7 +583,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, - for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ -+ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ - } } - assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ - } -@@ -551,7 +594,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, - for (u=0; u 0x7F) { - if (nbSeq == 0xFF) { - RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); -@@ -681,8 +719,16 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, - } - *nbSeqPtr = nbSeq; - -+ if (nbSeq == 0) { -+ /* No sequence : section ends immediately */ -+ RETURN_ERROR_IF(ip != iend, corruption_detected, -+ "extraneous data present in the Sequences section"); -+ return (size_t)(ip - istart); -+ } -+ - /* FSE table descriptors */ - RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ -+ RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ - { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); - symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); - symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); -@@ -829,7 +875,7 @@ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, pt - /* ZSTD_safecopyDstBeforeSrc(): - * This version allows overlap with dst before src, or handles the non-overlap case with dst after src - * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ --static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { -+static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) { - ptrdiff_t const diff = op - ip; - BYTE* const oend = op + length; - -@@ -858,6 +904,7 @@ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length - * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). - */ - FORCE_NOINLINE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR - size_t ZSTD_execSequenceEnd(BYTE* op, - BYTE* const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, -@@ -905,6 +952,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op, - * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. 
- */ - FORCE_NOINLINE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR - size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, - BYTE* const oend, const BYTE* const oend_w, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, -@@ -950,6 +998,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, - } - - HINT_INLINE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR - size_t ZSTD_execSequence(BYTE* op, - BYTE* const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, -@@ -964,6 +1013,11 @@ size_t ZSTD_execSequence(BYTE* op, - - assert(op != NULL /* Precondition */); - assert(oend_w < oend /* No underflow */); -+ -+#if defined(__aarch64__) -+ /* prefetch sequence starting from match that will be used for copy later */ -+ PREFETCH_L1(match); -+#endif - /* Handle edge cases in a slow path: - * - Read beyond end of literals - * - Match end is within WILDCOPY_OVERLIMIT of oend -@@ -1043,6 +1097,7 @@ size_t ZSTD_execSequence(BYTE* op, - } - - HINT_INLINE -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR - size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, - BYTE* const oend, const BYTE* const oend_w, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, -@@ -1154,7 +1209,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 - } - - /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum -- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) -+ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 - * bits before reloading. This value is the maximum number of bytes we read - * after reloading when we are decoding long offsets. - */ -@@ -1165,13 +1220,37 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 - - typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; - -+/* -+ * ZSTD_decodeSequence(): -+ * @p longOffsets : tells the decoder to reload more bit while decoding large offsets -+ * only used in 32-bit mode -+ * @return : Sequence (litL + matchL + offset) -+ */ - FORCE_INLINE_TEMPLATE seq_t --ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) -+ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) - { - seq_t seq; -+ /* -+ * ZSTD_seqSymbol is a 64 bits wide structure. -+ * It can be loaded in one operation -+ * and its fields extracted by simply shifting or bit-extracting on aarch64. -+ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh -+ * operations that cause performance drop. This can be avoided by using this -+ * ZSTD_memcpy hack. 
-+ */ -+#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) -+ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; -+ ZSTD_seqSymbol* const llDInfo = &llDInfoS; -+ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; -+ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; -+ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); -+ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); -+ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); -+#else - const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; - const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; - const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; -+#endif - seq.matchLength = mlDInfo->baseValue; - seq.litLength = llDInfo->baseValue; - { U32 const ofBase = ofDInfo->baseValue; -@@ -1186,28 +1265,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) - U32 const llnbBits = llDInfo->nbBits; - U32 const mlnbBits = mlDInfo->nbBits; - U32 const ofnbBits = ofDInfo->nbBits; -+ -+ assert(llBits <= MaxLLBits); -+ assert(mlBits <= MaxMLBits); -+ assert(ofBits <= MaxOff); - /* - * As gcc has better branch and block analyzers, sometimes it is only -- * valuable to mark likelyness for clang, it gives around 3-4% of -+ * valuable to mark likeliness for clang, it gives around 3-4% of - * performance. - */ - - /* sequence */ - { size_t offset; -- #if defined(__clang__) -- if (LIKELY(ofBits > 1)) { -- #else - if (ofBits > 1) { -- #endif - ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); - ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); -- assert(ofBits <= MaxOff); -+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); -+ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); - if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { -- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); -+ /* Always read extra bits, this keeps the logic simple, -+ * avoids branches, and avoids accidentally reading 0 bits. -+ */ -+ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; - offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); - BIT_reloadDStream(&seqState->DStream); -- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); -- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ -+ offset += BIT_readBitsFast(&seqState->DStream, extraBits); - } else { - offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); -@@ -1224,7 +1306,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) - } else { - offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); - { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; -- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ -+ temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ - if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset = temp; -@@ -1232,11 +1314,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) - seq.offset = offset; - } - -- #if defined(__clang__) -- if (UNLIKELY(mlBits > 0)) -- #else - if (mlBits > 0) -- #endif - seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); - - if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) -@@ -1246,11 +1324,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) - /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ - ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); - -- #if defined(__clang__) -- if (UNLIKELY(llBits > 0)) -- #else - if (llBits > 0) -- #endif - seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); - - if (MEM_32bits()) -@@ -1259,17 +1333,22 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) - DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", - (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); - -- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ -- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ -- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ -- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ -+ if (!isLastSeq) { -+ /* don't update FSE state for last Sequence */ -+ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ -+ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ -+ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ -+ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ -+ BIT_reloadDStream(&seqState->DStream); -+ } - } - - return seq; - } - --#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION --MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) -+#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) -+#if DEBUGLEVEL >= 1 -+static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) - { - size_t const windowSize = dctx->fParams.windowSize; - /* No dictionary used. */ -@@ -1283,30 +1362,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix - /* Dictionary is active. 
*/ - return 1; - } -+#endif - --MEM_STATIC void ZSTD_assertValidSequence( -+static void ZSTD_assertValidSequence( - ZSTD_DCtx const* dctx, - BYTE const* op, BYTE const* oend, - seq_t const seq, - BYTE const* prefixStart, BYTE const* virtualStart) - { - #if DEBUGLEVEL >= 1 -- size_t const windowSize = dctx->fParams.windowSize; -- size_t const sequenceSize = seq.litLength + seq.matchLength; -- BYTE const* const oLitEnd = op + seq.litLength; -- DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", -- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); -- assert(op <= oend); -- assert((size_t)(oend - op) >= sequenceSize); -- assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); -- if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { -- size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); -- /* Offset must be within the dictionary. */ -- assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); -- assert(seq.offset <= windowSize + dictSize); -- } else { -- /* Offset must be within our window. */ -- assert(seq.offset <= windowSize); -+ if (dctx->isFrameDecompression) { -+ size_t const windowSize = dctx->fParams.windowSize; -+ size_t const sequenceSize = seq.litLength + seq.matchLength; -+ BYTE const* const oLitEnd = op + seq.litLength; -+ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", -+ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); -+ assert(op <= oend); -+ assert((size_t)(oend - op) >= sequenceSize); -+ assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); -+ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { -+ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); -+ /* Offset must be within the dictionary. */ -+ assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); -+ assert(seq.offset <= windowSize + dictSize); -+ } else { -+ /* Offset must be within our window. 
*/ -+ assert(seq.offset <= windowSize); -+ } - } - #else - (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; -@@ -1322,23 +1404,21 @@ DONT_VECTORIZE - ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, -- const ZSTD_longOffset_e isLongOffset, -- const int frame) -+ const ZSTD_longOffset_e isLongOffset) - { - const BYTE* ip = (const BYTE*)seqStart; - const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE*)dst; -- BYTE* const oend = ostart + maxDstSize; -+ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); - BYTE* op = ostart; - const BYTE* litPtr = dctx->litPtr; - const BYTE* litBufferEnd = dctx->litBufferEnd; - const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); - const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); - const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); -- DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); -- (void)frame; -+ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); - -- /* Regen sequences */ -+ /* Literals are split between internal buffer & output buffer */ - if (nbSeq) { - seqState_t seqState; - dctx->fseEntropy = 1; -@@ -1357,8 +1437,7 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, - BIT_DStream_completed < BIT_DStream_overflow); - - /* decompress without overrunning litPtr begins */ -- { -- seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); -+ { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ - /* Align the decompression loop to 32 + 16 bytes. - * - * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression -@@ -1420,27 +1499,26 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, - #endif - - /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ -- for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { -- size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); -+ for ( ; nbSeq; nbSeq--) { -+ sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); -+ if (litPtr + sequence.litLength > dctx->litBufferEnd) break; -+ { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); - #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) -- assert(!ZSTD_isError(oneSeqSize)); -- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); -+ assert(!ZSTD_isError(oneSeqSize)); -+ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); - #endif -- if (UNLIKELY(ZSTD_isError(oneSeqSize))) -- return oneSeqSize; -- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); -- op += oneSeqSize; -- if (UNLIKELY(!--nbSeq)) -- break; -- BIT_reloadDStream(&(seqState.DStream)); -- sequence = ZSTD_decodeSequence(&seqState, isLongOffset); -- } -+ if (UNLIKELY(ZSTD_isError(oneSeqSize))) -+ return oneSeqSize; -+ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); -+ op += oneSeqSize; -+ } } -+ DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); - - /* If there are more sequences, they 
will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ - if (nbSeq > 0) { - const size_t leftoverLit = dctx->litBufferEnd - litPtr; -- if (leftoverLit) -- { -+ DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); -+ if (leftoverLit) { - RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); - ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); - sequence.litLength -= leftoverLit; -@@ -1449,24 +1527,22 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, - litPtr = dctx->litExtraBuffer; - litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; - dctx->litBufferLocation = ZSTD_not_in_dst; -- { -- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); -+ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); - #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) - assert(!ZSTD_isError(oneSeqSize)); -- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); -+ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); - #endif - if (UNLIKELY(ZSTD_isError(oneSeqSize))) - return oneSeqSize; - DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); - op += oneSeqSize; -- if (--nbSeq) -- BIT_reloadDStream(&(seqState.DStream)); - } -+ nbSeq--; - } - } - -- if (nbSeq > 0) /* there is remaining lit from extra buffer */ -- { -+ if (nbSeq > 0) { -+ /* there is remaining lit from extra buffer */ - - #if defined(__x86_64__) - __asm__(".p2align 6"); -@@ -1485,35 +1561,34 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, - # endif - #endif - -- for (; ; ) { -- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); -+ for ( ; nbSeq ; nbSeq--) { -+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); - size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); - #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) - assert(!ZSTD_isError(oneSeqSize)); -- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); -+ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); - #endif - if (UNLIKELY(ZSTD_isError(oneSeqSize))) - return oneSeqSize; - DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); - op += oneSeqSize; -- if (UNLIKELY(!--nbSeq)) -- break; -- BIT_reloadDStream(&(seqState.DStream)); - } - } - - /* check if reached exact end */ - DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); - RETURN_ERROR_IF(nbSeq, corruption_detected, ""); -- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); -+ DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); -+ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); - /* save reps for next block */ - { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } - } - - /* last literal segment */ -- if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ -- { -- size_t 
const lastLLSize = litBufferEnd - litPtr; -+ if (dctx->litBufferLocation == ZSTD_split) { -+ /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ -+ size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); -+ DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); - RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); - if (op != NULL) { - ZSTD_memmove(op, litPtr, lastLLSize); -@@ -1523,15 +1598,17 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, - litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; - dctx->litBufferLocation = ZSTD_not_in_dst; - } -- { size_t const lastLLSize = litBufferEnd - litPtr; -+ /* copy last literals from internal buffer */ -+ { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); -+ DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); - RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); - if (op != NULL) { - ZSTD_memcpy(op, litPtr, lastLLSize); - op += lastLLSize; -- } -- } -+ } } - -- return op-ostart; -+ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); -+ return (size_t)(op - ostart); - } - - FORCE_INLINE_TEMPLATE size_t -@@ -1539,21 +1616,19 @@ DONT_VECTORIZE - ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, -- const ZSTD_longOffset_e isLongOffset, -- const int frame) -+ const ZSTD_longOffset_e isLongOffset) - { - const BYTE* ip = (const BYTE*)seqStart; - const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE*)dst; -- BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; -+ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? 
ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; - BYTE* op = ostart; - const BYTE* litPtr = dctx->litPtr; - const BYTE* const litEnd = litPtr + dctx->litSize; - const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); - const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); - const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); -- DEBUGLOG(5, "ZSTD_decompressSequences_body"); -- (void)frame; -+ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); - - /* Regen sequences */ - if (nbSeq) { -@@ -1568,11 +1643,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, - ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); - assert(dst != NULL); - -- ZSTD_STATIC_ASSERT( -- BIT_DStream_unfinished < BIT_DStream_completed && -- BIT_DStream_endOfBuffer < BIT_DStream_completed && -- BIT_DStream_completed < BIT_DStream_overflow); -- - #if defined(__x86_64__) - __asm__(".p2align 6"); - __asm__("nop"); -@@ -1587,73 +1657,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, - # endif - #endif - -- for ( ; ; ) { -- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); -+ for ( ; nbSeq ; nbSeq--) { -+ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); - size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); - #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) - assert(!ZSTD_isError(oneSeqSize)); -- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); -+ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); - #endif - if (UNLIKELY(ZSTD_isError(oneSeqSize))) - return oneSeqSize; - DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); - op += oneSeqSize; -- if (UNLIKELY(!--nbSeq)) -- break; -- BIT_reloadDStream(&(seqState.DStream)); - } - - /* check if reached exact end */ -- DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); -- RETURN_ERROR_IF(nbSeq, corruption_detected, ""); -- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); -+ assert(nbSeq == 0); -+ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); - /* save reps for next block */ - { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } - } - - /* last literal segment */ -- { size_t const lastLLSize = litEnd - litPtr; -+ { size_t const lastLLSize = (size_t)(litEnd - litPtr); -+ DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); - RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); - if (op != NULL) { - ZSTD_memcpy(op, litPtr, lastLLSize); - op += lastLLSize; -- } -- } -+ } } - -- return op-ostart; -+ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); -+ return (size_t)(op - ostart); - } - - static size_t - ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, -- const ZSTD_longOffset_e isLongOffset, -- const int frame) -+ const ZSTD_longOffset_e isLongOffset) - { -- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); - } - - static size_t - ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, -- 
const ZSTD_longOffset_e isLongOffset, -- const int frame) -+ const ZSTD_longOffset_e isLongOffset) - { -- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); - } - #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ - - #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT - --FORCE_INLINE_TEMPLATE size_t --ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, -+FORCE_INLINE_TEMPLATE -+ -+size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, - const BYTE* const prefixStart, const BYTE* const dictEnd) - { - prefetchPos += sequence.litLength; - { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; -- const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. -- * No consequence though : memory address is only used for prefetching, not for dereferencing */ -+ /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. -+ * No consequence though : memory address is only used for prefetching, not for dereferencing */ -+ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); - PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ - } - return prefetchPos + sequence.matchLength; -@@ -1668,20 +1735,18 @@ ZSTD_decompressSequencesLong_body( - ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, -- const ZSTD_longOffset_e isLongOffset, -- const int frame) -+ const ZSTD_longOffset_e isLongOffset) - { - const BYTE* ip = (const BYTE*)seqStart; - const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE*)dst; -- BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; -+ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? 
dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); - BYTE* op = ostart; - const BYTE* litPtr = dctx->litPtr; - const BYTE* litBufferEnd = dctx->litBufferEnd; - const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); - const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); - const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); -- (void)frame; - - /* Regen sequences */ - if (nbSeq) { -@@ -1706,20 +1771,17 @@ ZSTD_decompressSequencesLong_body( - ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); - - /* prepare in advance */ -- for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNblitBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) -- { -+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { - /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ - const size_t leftoverLit = dctx->litBufferEnd - litPtr; - if (leftoverLit) -@@ -1732,26 +1794,26 @@ ZSTD_decompressSequencesLong_body( - litPtr = dctx->litExtraBuffer; - litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; - dctx->litBufferLocation = ZSTD_not_in_dst; -- oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); -+ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); - #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) -- assert(!ZSTD_isError(oneSeqSize)); -- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); -+ assert(!ZSTD_isError(oneSeqSize)); -+ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); - #endif -- if (ZSTD_isError(oneSeqSize)) return oneSeqSize; -+ if (ZSTD_isError(oneSeqSize)) return oneSeqSize; - -- prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); -- sequences[seqNb & STORED_SEQS_MASK] = sequence; -- op += oneSeqSize; -- } -+ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); -+ sequences[seqNb & STORED_SEQS_MASK] = sequence; -+ op += oneSeqSize; -+ } } - else - { - /* lit buffer is either wholly contained in first or second split, or not split at all*/ -- oneSeqSize = dctx->litBufferLocation == ZSTD_split ? -+ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
- ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : - ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); - #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) - assert(!ZSTD_isError(oneSeqSize)); -- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); -+ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); - #endif - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; - -@@ -1760,17 +1822,15 @@ ZSTD_decompressSequencesLong_body( - op += oneSeqSize; - } - } -- RETURN_ERROR_IF(seqNblitBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) -- { -+ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { - const size_t leftoverLit = dctx->litBufferEnd - litPtr; -- if (leftoverLit) -- { -+ if (leftoverLit) { - RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); - ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); - sequence->litLength -= leftoverLit; -@@ -1779,11 +1839,10 @@ ZSTD_decompressSequencesLong_body( - litPtr = dctx->litExtraBuffer; - litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; - dctx->litBufferLocation = ZSTD_not_in_dst; -- { -- size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); -+ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); - #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) - assert(!ZSTD_isError(oneSeqSize)); -- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); -+ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); - #endif - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; - op += oneSeqSize; -@@ -1796,7 +1855,7 @@ ZSTD_decompressSequencesLong_body( - ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); - #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) - assert(!ZSTD_isError(oneSeqSize)); -- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); -+ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); - #endif - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; - op += oneSeqSize; -@@ -1808,8 +1867,7 @@ ZSTD_decompressSequencesLong_body( - } - - /* last literal segment */ -- if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ -- { -+ if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ - size_t const lastLLSize = litBufferEnd - litPtr; - RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); - if (op != NULL) { -@@ -1827,17 +1885,16 @@ ZSTD_decompressSequencesLong_body( - } - } - -- return op-ostart; -+ return (size_t)(op - ostart); - } - 
- static size_t - ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, -- const ZSTD_longOffset_e isLongOffset, -- const int frame) -+ const ZSTD_longOffset_e isLongOffset) - { -- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); - } - #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ - -@@ -1851,20 +1908,18 @@ DONT_VECTORIZE - ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, -- const ZSTD_longOffset_e isLongOffset, -- const int frame) -+ const ZSTD_longOffset_e isLongOffset) - { -- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); - } - static BMI2_TARGET_ATTRIBUTE size_t - DONT_VECTORIZE - ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, -- const ZSTD_longOffset_e isLongOffset, -- const int frame) -+ const ZSTD_longOffset_e isLongOffset) - { -- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); - } - #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ - -@@ -1873,10 +1928,9 @@ static BMI2_TARGET_ATTRIBUTE size_t - ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, -- const ZSTD_longOffset_e isLongOffset, -- const int frame) -+ const ZSTD_longOffset_e isLongOffset) - { -- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); - } - #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ - -@@ -1886,37 +1940,34 @@ typedef size_t (*ZSTD_decompressSequences_t)( - ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, -- const ZSTD_longOffset_e isLongOffset, -- const int frame); -+ const ZSTD_longOffset_e isLongOffset); - - #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG - static size_t - ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, -- const ZSTD_longOffset_e isLongOffset, -- const int frame) -+ const ZSTD_longOffset_e isLongOffset) - { - DEBUGLOG(5, "ZSTD_decompressSequences"); - #if DYNAMIC_BMI2 - if (ZSTD_DCtx_get_bmi2(dctx)) { -- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); - } - #endif -- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); - } - static size_t - ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, -- const ZSTD_longOffset_e isLongOffset, -- 
const int frame) -+ const ZSTD_longOffset_e isLongOffset) - { - DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); - #if DYNAMIC_BMI2 - if (ZSTD_DCtx_get_bmi2(dctx)) { -- return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); - } - #endif -- return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); - } - #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ - -@@ -1931,69 +1982,114 @@ static size_t - ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize, int nbSeq, -- const ZSTD_longOffset_e isLongOffset, -- const int frame) -+ const ZSTD_longOffset_e isLongOffset) - { - DEBUGLOG(5, "ZSTD_decompressSequencesLong"); - #if DYNAMIC_BMI2 - if (ZSTD_DCtx_get_bmi2(dctx)) { -- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); - } - #endif -- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); - } - #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ - - -+/* -+ * @returns The total size of the history referenceable by zstd, including -+ * both the prefix and the extDict. At @p op any offset larger than this -+ * is invalid. -+ */ -+static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) -+{ -+ return (size_t)(op - virtualStart); -+} -+ -+typedef struct { -+ unsigned longOffsetShare; -+ unsigned maxNbAdditionalBits; -+} ZSTD_OffsetInfo; - --#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ -- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) --/* ZSTD_getLongOffsetsShare() : -+/* ZSTD_getOffsetInfo() : - * condition : offTable must be valid - * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) -- * compared to maximum possible of (1< 22) total += 1; -+ ZSTD_OffsetInfo info = {0, 0}; -+ /* If nbSeq == 0, then the offTable is uninitialized, but we have -+ * no sequences, so both values should be 0. -+ */ -+ if (nbSeq != 0) { -+ const void* ptr = offTable; -+ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; -+ const ZSTD_seqSymbol* table = offTable + 1; -+ U32 const max = 1 << tableLog; -+ U32 u; -+ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); -+ -+ assert(max <= (1 << OffFSELog)); /* max not too large */ -+ for (u=0; u 22) info.longOffsetShare += 1; -+ } -+ -+ assert(tableLog <= OffFSELog); -+ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ - } - -- assert(tableLog <= OffFSELog); -- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ -+ return info; -+} - -- return total; -+/* -+ * @returns The maximum offset we can decode in one read of our bitstream, without -+ * reloading more bits in the middle of the offset bits read. Any offsets larger -+ * than this must use the long offset decoder. -+ */ -+static size_t ZSTD_maxShortOffset(void) -+{ -+ if (MEM_64bits()) { -+ /* We can decode any offset without reloading bits. 
-+ * This might change if the max window size grows. -+ */ -+ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); -+ return (size_t)-1; -+ } else { -+ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. -+ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. -+ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. -+ */ -+ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; -+ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; -+ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); -+ return maxOffset; -+ } - } --#endif - - size_t - ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, - void* dst, size_t dstCapacity, -- const void* src, size_t srcSize, const int frame, const streaming_operation streaming) -+ const void* src, size_t srcSize, const streaming_operation streaming) - { /* blockType == blockCompressed */ - const BYTE* ip = (const BYTE*)src; -- /* isLongOffset must be true if there are long offsets. -- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. -- * We don't expect that to be the case in 64-bit mode. -- * In block mode, window size is not known, so we have to be conservative. -- * (note: but it could be evaluated from current-lowLimit) -- */ -- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); -- DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); -- -- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); -+ DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); -+ -+ /* Note : the wording of the specification -+ * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). -+ * This generally does not happen, as it makes little sense, -+ * since an uncompressed block would feature same size and have no decompression cost. -+ * Also, note that decoder from reference libzstd before < v1.5.4 -+ * would consider this edge case as an error. -+ * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) -+ * for broader compatibility with the deployed ecosystem of zstd decoders */ -+ RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); - - /* Decode literals section */ - { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); -- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); -+ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); - if (ZSTD_isError(litCSize)) return litCSize; - ip += litCSize; - srcSize -= litCSize; -@@ -2001,6 +2097,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, - - /* Build Decoding Tables */ - { -+ /* Compute the maximum block size, which must also work when !frame and fParams are unset. -+ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. -+ */ -+ size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); -+ size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); -+ /* isLongOffset must be true if there are long offsets. -+ * Offsets are long if they are larger than ZSTD_maxShortOffset(). -+ * We don't expect that to be the case in 64-bit mode. -+ * -+ * We check here to see if our history is large enough to allow long offsets. -+ * If it isn't, then we can't possible have (valid) long offsets. 
If the offset -+ * is invalid, then it is okay to read it incorrectly. -+ * -+ * If isLongOffsets is true, then we will later check our decoding table to see -+ * if it is even possible to generate long offsets. -+ */ -+ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); - /* These macros control at build-time which decompressor implementation - * we use. If neither is defined, we do some inspection and dispatch at - * runtime. -@@ -2008,6 +2121,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, - #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ - !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) - int usePrefetchDecoder = dctx->ddictIsCold; -+#else -+ /* Set to 1 to avoid computing offset info if we don't need to. -+ * Otherwise this value is ignored. -+ */ -+ int usePrefetchDecoder = 1; - #endif - int nbSeq; - size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); -@@ -2015,40 +2133,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, - ip += seqHSize; - srcSize -= seqHSize; - -- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); -+ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); -+ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, -+ "invalid dst"); - --#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ -- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) -- if ( !usePrefetchDecoder -- && (!frame || (dctx->fParams.windowSize > (1<<24))) -- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ -- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); -- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ -- usePrefetchDecoder = (shareLongOffsets >= minShare); -+ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, -+ * compute information about the share of long offsets, and the maximum nbAdditionalBits. -+ * NOTE: could probably use a larger nbSeq limit -+ */ -+ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { -+ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); -+ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { -+ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small -+ * enough, then we know it is impossible to have too long an offset in this block, so we can -+ * use the regular offset decoder. -+ */ -+ isLongOffset = ZSTD_lo_isRegularOffset; -+ } -+ if (!usePrefetchDecoder) { -+ U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ -+ usePrefetchDecoder = (info.longOffsetShare >= minShare); -+ } - } --#endif - - dctx->ddictIsCold = 0; - - #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ - !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) -- if (usePrefetchDecoder) -+ if (usePrefetchDecoder) { -+#else -+ (void)usePrefetchDecoder; -+ { - #endif - #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT -- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); - #endif -+ } - - #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG - /* else */ - if (dctx->litBufferLocation == ZSTD_split) -- return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); - else -- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); -+ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); - #endif - } - } - - -+ZSTD_ALLOW_POINTER_OVERFLOW_ATTR - void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) - { - if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ -@@ -2060,13 +2193,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) - } - - --size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, -- void* dst, size_t dstCapacity, -- const void* src, size_t srcSize) -+size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, -+ void* dst, size_t dstCapacity, -+ const void* src, size_t srcSize) - { - size_t dSize; -+ dctx->isFrameDecompression = 0; - ZSTD_checkContinuity(dctx, dst, dstCapacity); -- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); -+ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); -+ FORWARD_IF_ERROR(dSize, ""); - dctx->previousDstEnd = (char*)dst + dSize; - return dSize; - } -+ -+ -+/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ -+size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, -+ void* dst, size_t dstCapacity, -+ const void* src, size_t srcSize) -+{ -+ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); -+} -diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h -index 3d2d57a5d25a..becffbd89364 100644 ---- a/lib/zstd/decompress/zstd_decompress_block.h -+++ b/lib/zstd/decompress/zstd_decompress_block.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -47,7 +48,7 @@ typedef enum { - */ - size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, - void* dst, size_t dstCapacity, -- const void* src, size_t srcSize, const int frame, const streaming_operation streaming); -+ const void* src, size_t srcSize, const streaming_operation streaming); - - /* ZSTD_buildFSETable() : - * generate FSE decoding table for one symbol (ll, ml or off) -@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, - unsigned tableLog, void* wksp, size_t wkspSize, - int bmi2); - -+/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */ -+size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, -+ void* dst, size_t dstCapacity, -+ const void* src, size_t srcSize); -+ - - #endif /* ZSTD_DEC_BLOCK_H */ -diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h -index 98102edb6a83..0f02526be774 100644 ---- a/lib/zstd/decompress/zstd_decompress_internal.h -+++ b/lib/zstd/decompress/zstd_decompress_internal.h -@@ -1,5 +1,6 @@ -+/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Yann Collet, Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = { - - #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) - #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) -+#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 - - typedef struct { - ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ - ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ - ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ -- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ -+ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ - U32 rep[ZSTD_REP_NUM]; - U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; - } ZSTD_entropyDTables_t; -@@ -152,6 +154,7 @@ struct ZSTD_DCtx_s - size_t litSize; - size_t rleSize; - size_t staticSize; -+ int isFrameDecompression; - #if DYNAMIC_BMI2 != 0 - int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ - #endif -@@ -164,6 +167,8 @@ struct ZSTD_DCtx_s - ZSTD_dictUses_e dictUses; - ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ - ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ -+ int disableHufAsm; -+ int maxBlockSizeParam; - - /* streaming */ - ZSTD_dStreamStage streamStage; -diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h -index a06ca187aab5..8a47eb2a4514 100644 ---- a/lib/zstd/decompress_sources.h -+++ b/lib/zstd/decompress_sources.h -@@ -1,6 +1,6 @@ - /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ - /* -- * Copyright (c) Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c -index 22686e367e6f..466828e35752 100644 ---- a/lib/zstd/zstd_common_module.c -+++ b/lib/zstd/zstd_common_module.c -@@ -1,6 +1,6 @@ - // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); - EXPORT_SYMBOL_GPL(ZSTD_isError); - EXPORT_SYMBOL_GPL(ZSTD_getErrorName); - EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); --EXPORT_SYMBOL_GPL(ZSTD_customMalloc); --EXPORT_SYMBOL_GPL(ZSTD_customCalloc); --EXPORT_SYMBOL_GPL(ZSTD_customFree); - - MODULE_LICENSE("Dual BSD/GPL"); - MODULE_DESCRIPTION("Zstd Common"); -diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c -index bd8784449b31..ceaf352d03e2 100644 ---- a/lib/zstd/zstd_compress_module.c -+++ b/lib/zstd/zstd_compress_module.c -@@ -1,6 +1,6 @@ - // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the -diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c -index 469fc3059be0..0ae819f0c927 100644 ---- a/lib/zstd/zstd_decompress_module.c -+++ b/lib/zstd/zstd_decompress_module.c -@@ -1,6 +1,6 @@ - // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause - /* -- * Copyright (c) Facebook, Inc. -+ * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. 
- * - * This source code is licensed under both the BSD-style license (found in the -@@ -113,7 +113,7 @@ EXPORT_SYMBOL(zstd_init_dstream); - - size_t zstd_reset_dstream(zstd_dstream *dstream) - { -- return ZSTD_resetDStream(dstream); -+ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); - } - EXPORT_SYMBOL(zstd_reset_dstream); - --- -2.47.0.rc0 - diff --git a/patches/sys-kernel/gentoo-sources/v4l2loopback.patch b/patches/sys-kernel/gentoo-sources/v4l2loopback.patch deleted file mode 100644 index bff5ca3..0000000 --- a/patches/sys-kernel/gentoo-sources/v4l2loopback.patch +++ /dev/null @@ -1,3767 +0,0 @@ -From ad3c47e37228d67a40b588732a87134b3ba51c68 Mon Sep 17 00:00:00 2001 -From: Oleksandr Natalenko -Date: Mon, 22 Jan 2024 18:14:26 +0100 -Subject: [PATCH] media: v4l2-core: add v4l2loopback - -Signed-off-by: Oleksandr Natalenko ---- - drivers/media/v4l2-core/Kconfig | 5 + - drivers/media/v4l2-core/Makefile | 2 + - drivers/media/v4l2-core/v4l2loopback.c | 3161 +++++++++++++++++ - drivers/media/v4l2-core/v4l2loopback.h | 98 + - .../media/v4l2-core/v4l2loopback_formats.h | 445 +++ - 5 files changed, 3711 insertions(+) - create mode 100644 drivers/media/v4l2-core/v4l2loopback.c - create mode 100644 drivers/media/v4l2-core/v4l2loopback.h - create mode 100644 drivers/media/v4l2-core/v4l2loopback_formats.h - -diff --git a/drivers/media/v4l2-core/Kconfig b/drivers/media/v4l2-core/Kconfig -index 331b8e535e5b..80dabeebf580 100644 ---- a/drivers/media/v4l2-core/Kconfig -+++ b/drivers/media/v4l2-core/Kconfig -@@ -40,6 +40,11 @@ config VIDEO_TUNER - config V4L2_JPEG_HELPER - tristate - -+config V4L2_LOOPBACK -+ tristate "V4L2 loopback device" -+ help -+ V4L2 loopback device -+ - # Used by drivers that need v4l2-h264.ko - config V4L2_H264 - tristate -diff --git a/drivers/media/v4l2-core/Makefile b/drivers/media/v4l2-core/Makefile -index 2177b9d63a8f..c179507cedc4 100644 ---- a/drivers/media/v4l2-core/Makefile -+++ b/drivers/media/v4l2-core/Makefile -@@ -33,5 +33,7 @@ obj-$(CONFIG_V4L2_JPEG_HELPER) += v4l2-jpeg.o - obj-$(CONFIG_V4L2_MEM2MEM_DEV) += v4l2-mem2mem.o - obj-$(CONFIG_V4L2_VP9) += v4l2-vp9.o - -+obj-$(CONFIG_V4L2_LOOPBACK) += v4l2loopback.o -+ - obj-$(CONFIG_VIDEO_TUNER) += tuner.o - obj-$(CONFIG_VIDEO_DEV) += v4l2-dv-timings.o videodev.o -diff --git a/drivers/media/v4l2-core/v4l2loopback.c b/drivers/media/v4l2-core/v4l2loopback.c -new file mode 100644 -index 000000000000..01d5fe7ce88b ---- /dev/null -+++ b/drivers/media/v4l2-core/v4l2loopback.c -@@ -0,0 +1,3161 @@ -+/* -*- c-file-style: "linux" -*- */ -+/* -+ * v4l2loopback.c -- video4linux2 loopback driver -+ * -+ * Copyright (C) 2005-2009 Vasily Levin (vasaka@gmail.com) -+ * Copyright (C) 2010-2023 IOhannes m zmoelnig (zmoelnig@iem.at) -+ * Copyright (C) 2011 Stefan Diewald (stefan.diewald@mytum.de) -+ * Copyright (C) 2012 Anton Novikov (random.plant@gmail.com) -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include "v4l2loopback.h" -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0) -+#error This module is not supported on kernels before 4.0.0. 
-+#endif -+ -+#if defined(timer_setup) && defined(from_timer) -+#define HAVE_TIMER_SETUP -+#endif -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) -+#define VFL_TYPE_VIDEO VFL_TYPE_GRABBER -+#endif -+ -+#define V4L2LOOPBACK_VERSION_CODE \ -+ KERNEL_VERSION(V4L2LOOPBACK_VERSION_MAJOR, V4L2LOOPBACK_VERSION_MINOR, \ -+ V4L2LOOPBACK_VERSION_BUGFIX) -+ -+MODULE_DESCRIPTION("V4L2 loopback video device"); -+MODULE_AUTHOR("Vasily Levin, " -+ "IOhannes m zmoelnig ," -+ "Stefan Diewald," -+ "Anton Novikov" -+ "et al."); -+#ifdef SNAPSHOT_VERSION -+MODULE_VERSION(__stringify(SNAPSHOT_VERSION)); -+#else -+MODULE_VERSION("" __stringify(V4L2LOOPBACK_VERSION_MAJOR) "." __stringify( -+ V4L2LOOPBACK_VERSION_MINOR) "." __stringify(V4L2LOOPBACK_VERSION_BUGFIX)); -+#endif -+MODULE_LICENSE("GPL"); -+ -+/* -+ * helpers -+ */ -+#define dprintk(fmt, args...) \ -+ do { \ -+ if (debug > 0) { \ -+ printk(KERN_INFO "v4l2-loopback[" __stringify( \ -+ __LINE__) "], pid(%d): " fmt, \ -+ task_pid_nr(current), ##args); \ -+ } \ -+ } while (0) -+ -+#define MARK() \ -+ do { \ -+ if (debug > 1) { \ -+ printk(KERN_INFO "%s:%d[%s], pid(%d)\n", __FILE__, \ -+ __LINE__, __func__, task_pid_nr(current)); \ -+ } \ -+ } while (0) -+ -+#define dprintkrw(fmt, args...) \ -+ do { \ -+ if (debug > 2) { \ -+ printk(KERN_INFO "v4l2-loopback[" __stringify( \ -+ __LINE__) "], pid(%d): " fmt, \ -+ task_pid_nr(current), ##args); \ -+ } \ -+ } while (0) -+ -+static inline void v4l2l_get_timestamp(struct v4l2_buffer *b) -+{ -+ struct timespec64 ts; -+ ktime_get_ts64(&ts); -+ -+ b->timestamp.tv_sec = ts.tv_sec; -+ b->timestamp.tv_usec = (ts.tv_nsec / NSEC_PER_USEC); -+ b->flags |= V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; -+} -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) -+typedef unsigned __poll_t; -+#endif -+ -+/* module constants -+ * can be overridden during he build process using something like -+ * make KCPPFLAGS="-DMAX_DEVICES=100" -+ */ -+ -+/* maximum number of v4l2loopback devices that can be created */ -+#ifndef MAX_DEVICES -+#define MAX_DEVICES 8 -+#endif -+ -+/* whether the default is to announce capabilities exclusively or not */ -+#ifndef V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS -+#define V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS 0 -+#endif -+ -+/* when a producer is considered to have gone stale */ -+#ifndef MAX_TIMEOUT -+#define MAX_TIMEOUT (100 * 1000) /* in msecs */ -+#endif -+ -+/* max buffers that can be mapped, actually they -+ * are all mapped to max_buffers buffers */ -+#ifndef MAX_BUFFERS -+#define MAX_BUFFERS 32 -+#endif -+ -+/* module parameters */ -+static int debug = 0; -+module_param(debug, int, S_IRUGO | S_IWUSR); -+MODULE_PARM_DESC(debug, "debugging level (higher values == more verbose)"); -+ -+#define V4L2LOOPBACK_DEFAULT_MAX_BUFFERS 2 -+static int max_buffers = V4L2LOOPBACK_DEFAULT_MAX_BUFFERS; -+module_param(max_buffers, int, S_IRUGO); -+MODULE_PARM_DESC(max_buffers, -+ "how many buffers should be allocated [DEFAULT: " __stringify( -+ V4L2LOOPBACK_DEFAULT_MAX_BUFFERS) "]"); -+ -+/* how many times a device can be opened -+ * the per-module default value can be overridden on a per-device basis using -+ * the /sys/devices interface -+ * -+ * note that max_openers should be at least 2 in order to get a working system: -+ * one opener for the producer and one opener for the consumer -+ * however, we leave that to the user -+ */ -+#define V4L2LOOPBACK_DEFAULT_MAX_OPENERS 10 -+static int max_openers = V4L2LOOPBACK_DEFAULT_MAX_OPENERS; -+module_param(max_openers, int, S_IRUGO | S_IWUSR); -+MODULE_PARM_DESC( -+ max_openers, -+ 
"how many users can open the loopback device [DEFAULT: " __stringify( -+ V4L2LOOPBACK_DEFAULT_MAX_OPENERS) "]"); -+ -+static int devices = -1; -+module_param(devices, int, 0); -+MODULE_PARM_DESC(devices, "how many devices should be created"); -+ -+static int video_nr[MAX_DEVICES] = { [0 ...(MAX_DEVICES - 1)] = -1 }; -+module_param_array(video_nr, int, NULL, 0444); -+MODULE_PARM_DESC(video_nr, -+ "video device numbers (-1=auto, 0=/dev/video0, etc.)"); -+ -+static char *card_label[MAX_DEVICES]; -+module_param_array(card_label, charp, NULL, 0000); -+MODULE_PARM_DESC(card_label, "card labels for each device"); -+ -+static bool exclusive_caps[MAX_DEVICES] = { -+ [0 ...(MAX_DEVICES - 1)] = V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS -+}; -+module_param_array(exclusive_caps, bool, NULL, 0444); -+/* FIXXME: wording */ -+MODULE_PARM_DESC( -+ exclusive_caps, -+ "whether to announce OUTPUT/CAPTURE capabilities exclusively or not [DEFAULT: " __stringify( -+ V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS) "]"); -+ -+/* format specifications */ -+#define V4L2LOOPBACK_SIZE_MIN_WIDTH 48 -+#define V4L2LOOPBACK_SIZE_MIN_HEIGHT 32 -+#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH 8192 -+#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT 8192 -+ -+#define V4L2LOOPBACK_SIZE_DEFAULT_WIDTH 640 -+#define V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT 480 -+ -+static int max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; -+module_param(max_width, int, S_IRUGO); -+MODULE_PARM_DESC(max_width, -+ "maximum allowed frame width [DEFAULT: " __stringify( -+ V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH) "]"); -+static int max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; -+module_param(max_height, int, S_IRUGO); -+MODULE_PARM_DESC(max_height, -+ "maximum allowed frame height [DEFAULT: " __stringify( -+ V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT) "]"); -+ -+static DEFINE_IDR(v4l2loopback_index_idr); -+static DEFINE_MUTEX(v4l2loopback_ctl_mutex); -+ -+/* frame intervals */ -+#define V4L2LOOPBACK_FPS_MIN 0 -+#define V4L2LOOPBACK_FPS_MAX 1000 -+ -+/* control IDs */ -+#define V4L2LOOPBACK_CID_BASE (V4L2_CID_USER_BASE | 0xf000) -+#define CID_KEEP_FORMAT (V4L2LOOPBACK_CID_BASE + 0) -+#define CID_SUSTAIN_FRAMERATE (V4L2LOOPBACK_CID_BASE + 1) -+#define CID_TIMEOUT (V4L2LOOPBACK_CID_BASE + 2) -+#define CID_TIMEOUT_IMAGE_IO (V4L2LOOPBACK_CID_BASE + 3) -+ -+static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl); -+static const struct v4l2_ctrl_ops v4l2loopback_ctrl_ops = { -+ .s_ctrl = v4l2loopback_s_ctrl, -+}; -+static const struct v4l2_ctrl_config v4l2loopback_ctrl_keepformat = { -+ // clang-format off -+ .ops = &v4l2loopback_ctrl_ops, -+ .id = CID_KEEP_FORMAT, -+ .name = "keep_format", -+ .type = V4L2_CTRL_TYPE_BOOLEAN, -+ .min = 0, -+ .max = 1, -+ .step = 1, -+ .def = 0, -+ // clang-format on -+}; -+static const struct v4l2_ctrl_config v4l2loopback_ctrl_sustainframerate = { -+ // clang-format off -+ .ops = &v4l2loopback_ctrl_ops, -+ .id = CID_SUSTAIN_FRAMERATE, -+ .name = "sustain_framerate", -+ .type = V4L2_CTRL_TYPE_BOOLEAN, -+ .min = 0, -+ .max = 1, -+ .step = 1, -+ .def = 0, -+ // clang-format on -+}; -+static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeout = { -+ // clang-format off -+ .ops = &v4l2loopback_ctrl_ops, -+ .id = CID_TIMEOUT, -+ .name = "timeout", -+ .type = V4L2_CTRL_TYPE_INTEGER, -+ .min = 0, -+ .max = MAX_TIMEOUT, -+ .step = 1, -+ .def = 0, -+ // clang-format on -+}; -+static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeoutimageio = { -+ // clang-format off -+ .ops = &v4l2loopback_ctrl_ops, -+ .id = CID_TIMEOUT_IMAGE_IO, -+ .name = "timeout_image_io", -+ 
.type = V4L2_CTRL_TYPE_BUTTON, -+ .min = 0, -+ .max = 1, -+ .step = 1, -+ .def = 0, -+ // clang-format on -+}; -+ -+/* module structures */ -+struct v4l2loopback_private { -+ int device_nr; -+}; -+ -+/* TODO(vasaka) use typenames which are common to kernel, but first find out if -+ * it is needed */ -+/* struct keeping state and settings of loopback device */ -+ -+struct v4l2l_buffer { -+ struct v4l2_buffer buffer; -+ struct list_head list_head; -+ int use_count; -+}; -+ -+struct v4l2_loopback_device { -+ struct v4l2_device v4l2_dev; -+ struct v4l2_ctrl_handler ctrl_handler; -+ struct video_device *vdev; -+ /* pixel and stream format */ -+ struct v4l2_pix_format pix_format; -+ bool pix_format_has_valid_sizeimage; -+ struct v4l2_captureparm capture_param; -+ unsigned long frame_jiffies; -+ -+ /* ctrls */ -+ int keep_format; /* CID_KEEP_FORMAT; stay ready_for_capture even when all -+ openers close() the device */ -+ int sustain_framerate; /* CID_SUSTAIN_FRAMERATE; duplicate frames to maintain -+ (close to) nominal framerate */ -+ -+ /* buffers stuff */ -+ u8 *image; /* pointer to actual buffers data */ -+ unsigned long int imagesize; /* size of buffers data */ -+ int buffers_number; /* should not be big, 4 is a good choice */ -+ struct v4l2l_buffer buffers[MAX_BUFFERS]; /* inner driver buffers */ -+ int used_buffers; /* number of the actually used buffers */ -+ int max_openers; /* how many times can this device be opened */ -+ -+ s64 write_position; /* number of last written frame + 1 */ -+ struct list_head outbufs_list; /* buffers in output DQBUF order */ -+ int bufpos2index -+ [MAX_BUFFERS]; /* mapping of (read/write_position % used_buffers) -+ * to inner buffer index */ -+ long buffer_size; -+ -+ /* sustain_framerate stuff */ -+ struct timer_list sustain_timer; -+ unsigned int reread_count; -+ -+ /* timeout stuff */ -+ unsigned long timeout_jiffies; /* CID_TIMEOUT; 0 means disabled */ -+ int timeout_image_io; /* CID_TIMEOUT_IMAGE_IO; next opener will -+ * read/write to timeout_image */ -+ u8 *timeout_image; /* copy of it will be captured when timeout passes */ -+ struct v4l2l_buffer timeout_image_buffer; -+ struct timer_list timeout_timer; -+ int timeout_happened; -+ -+ /* sync stuff */ -+ atomic_t open_count; -+ -+ int ready_for_capture; /* set to the number of writers that opened the -+ * device and negotiated format. */ -+ int ready_for_output; /* set to true when no writer is currently attached -+ * this differs slightly from !ready_for_capture, -+ * e.g. when using fallback images */ -+ int active_readers; /* increase if any reader starts streaming */ -+ int announce_all_caps; /* set to false, if device caps (OUTPUT/CAPTURE) -+ * should only be announced if the resp. 
"ready" -+ * flag is set; default=TRUE */ -+ -+ int min_width, max_width; -+ int min_height, max_height; -+ -+ char card_label[32]; -+ -+ wait_queue_head_t read_event; -+ spinlock_t lock, list_lock; -+}; -+ -+/* types of opener shows what opener wants to do with loopback */ -+enum opener_type { -+ // clang-format off -+ UNNEGOTIATED = 0, -+ READER = 1, -+ WRITER = 2, -+ // clang-format on -+}; -+ -+/* struct keeping state and type of opener */ -+struct v4l2_loopback_opener { -+ enum opener_type type; -+ s64 read_position; /* number of last processed frame + 1 or -+ * write_position - 1 if reader went out of sync */ -+ unsigned int reread_count; -+ struct v4l2_buffer *buffers; -+ int buffers_number; /* should not be big, 4 is a good choice */ -+ int timeout_image_io; -+ -+ struct v4l2_fh fh; -+}; -+ -+#define fh_to_opener(ptr) container_of((ptr), struct v4l2_loopback_opener, fh) -+ -+/* this is heavily inspired by the bttv driver found in the linux kernel */ -+struct v4l2l_format { -+ char *name; -+ int fourcc; /* video4linux 2 */ -+ int depth; /* bit/pixel */ -+ int flags; -+}; -+/* set the v4l2l_format.flags to PLANAR for non-packed formats */ -+#define FORMAT_FLAGS_PLANAR 0x01 -+#define FORMAT_FLAGS_COMPRESSED 0x02 -+ -+#include "v4l2loopback_formats.h" -+ -+#ifndef V4L2_TYPE_IS_CAPTURE -+#define V4L2_TYPE_IS_CAPTURE(type) \ -+ ((type) == V4L2_BUF_TYPE_VIDEO_CAPTURE || \ -+ (type) == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) -+#endif /* V4L2_TYPE_IS_CAPTURE */ -+#ifndef V4L2_TYPE_IS_OUTPUT -+#define V4L2_TYPE_IS_OUTPUT(type) \ -+ ((type) == V4L2_BUF_TYPE_VIDEO_OUTPUT || \ -+ (type) == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE) -+#endif /* V4L2_TYPE_IS_OUTPUT */ -+ -+/* whether the format can be changed */ -+/* the format is fixated if we -+ - have writers (ready_for_capture>0) -+ - and/or have readers (active_readers>0) -+*/ -+#define V4L2LOOPBACK_IS_FIXED_FMT(device) \ -+ (device->ready_for_capture > 0 || device->active_readers > 0 || \ -+ device->keep_format) -+ -+static const unsigned int FORMATS = ARRAY_SIZE(formats); -+ -+static char *fourcc2str(unsigned int fourcc, char buf[4]) -+{ -+ buf[0] = (fourcc >> 0) & 0xFF; -+ buf[1] = (fourcc >> 8) & 0xFF; -+ buf[2] = (fourcc >> 16) & 0xFF; -+ buf[3] = (fourcc >> 24) & 0xFF; -+ -+ return buf; -+} -+ -+static const struct v4l2l_format *format_by_fourcc(int fourcc) -+{ -+ unsigned int i; -+ -+ for (i = 0; i < FORMATS; i++) { -+ if (formats[i].fourcc == fourcc) -+ return formats + i; -+ } -+ -+ dprintk("unsupported format '%c%c%c%c'\n", (fourcc >> 0) & 0xFF, -+ (fourcc >> 8) & 0xFF, (fourcc >> 16) & 0xFF, -+ (fourcc >> 24) & 0xFF); -+ return NULL; -+} -+ -+static void pix_format_set_size(struct v4l2_pix_format *f, -+ const struct v4l2l_format *fmt, -+ unsigned int width, unsigned int height) -+{ -+ f->width = width; -+ f->height = height; -+ -+ if (fmt->flags & FORMAT_FLAGS_PLANAR) { -+ f->bytesperline = width; /* Y plane */ -+ f->sizeimage = (width * height * fmt->depth) >> 3; -+ } else if (fmt->flags & FORMAT_FLAGS_COMPRESSED) { -+ /* doesn't make sense for compressed formats */ -+ f->bytesperline = 0; -+ f->sizeimage = (width * height * fmt->depth) >> 3; -+ } else { -+ f->bytesperline = (width * fmt->depth) >> 3; -+ f->sizeimage = height * f->bytesperline; -+ } -+} -+ -+static int v4l2l_fill_format(struct v4l2_format *fmt, int capture, -+ const u32 minwidth, const u32 maxwidth, -+ const u32 minheight, const u32 maxheight) -+{ -+ u32 width = fmt->fmt.pix.width, height = fmt->fmt.pix.height; -+ u32 pixelformat = fmt->fmt.pix.pixelformat; -+ struct 
v4l2_format fmt0 = *fmt; -+ u32 bytesperline = 0, sizeimage = 0; -+ if (!width) -+ width = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; -+ if (!height) -+ height = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; -+ if (width < minwidth) -+ width = minwidth; -+ if (width > maxwidth) -+ width = maxwidth; -+ if (height < minheight) -+ height = minheight; -+ if (height > maxheight) -+ height = maxheight; -+ -+ /* sets: width,height,pixelformat,bytesperline,sizeimage */ -+ if (!(V4L2_TYPE_IS_MULTIPLANAR(fmt0.type))) { -+ fmt0.fmt.pix.bytesperline = 0; -+ fmt0.fmt.pix.sizeimage = 0; -+ } -+ -+ if (0) { -+ ; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) -+ } else if (!v4l2_fill_pixfmt(&fmt0.fmt.pix, pixelformat, width, -+ height)) { -+ ; -+ } else if (!v4l2_fill_pixfmt_mp(&fmt0.fmt.pix_mp, pixelformat, width, -+ height)) { -+ ; -+#endif -+ } else { -+ const struct v4l2l_format *format = -+ format_by_fourcc(pixelformat); -+ if (!format) -+ return -EINVAL; -+ pix_format_set_size(&fmt0.fmt.pix, format, width, height); -+ fmt0.fmt.pix.pixelformat = format->fourcc; -+ } -+ -+ if (V4L2_TYPE_IS_MULTIPLANAR(fmt0.type)) { -+ *fmt = fmt0; -+ -+ if ((fmt->fmt.pix_mp.colorspace == V4L2_COLORSPACE_DEFAULT) || -+ (fmt->fmt.pix_mp.colorspace > V4L2_COLORSPACE_DCI_P3)) -+ fmt->fmt.pix_mp.colorspace = V4L2_COLORSPACE_SRGB; -+ if (V4L2_FIELD_ANY == fmt->fmt.pix_mp.field) -+ fmt->fmt.pix_mp.field = V4L2_FIELD_NONE; -+ if (capture) -+ fmt->type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; -+ else -+ fmt->type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; -+ } else { -+ bytesperline = fmt->fmt.pix.bytesperline; -+ sizeimage = fmt->fmt.pix.sizeimage; -+ -+ *fmt = fmt0; -+ -+ if (!fmt->fmt.pix.bytesperline) -+ fmt->fmt.pix.bytesperline = bytesperline; -+ if (!fmt->fmt.pix.sizeimage) -+ fmt->fmt.pix.sizeimage = sizeimage; -+ -+ if ((fmt->fmt.pix.colorspace == V4L2_COLORSPACE_DEFAULT) || -+ (fmt->fmt.pix.colorspace > V4L2_COLORSPACE_DCI_P3)) -+ fmt->fmt.pix.colorspace = V4L2_COLORSPACE_SRGB; -+ if (V4L2_FIELD_ANY == fmt->fmt.pix.field) -+ fmt->fmt.pix.field = V4L2_FIELD_NONE; -+ if (capture) -+ fmt->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -+ else -+ fmt->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; -+ } -+ -+ return 0; -+} -+ -+/* Checks if v4l2l_fill_format() has set a valid, fixed sizeimage val. */ -+static bool v4l2l_pix_format_has_valid_sizeimage(struct v4l2_format *fmt) -+{ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) -+ const struct v4l2_format_info *info; -+ -+ info = v4l2_format_info(fmt->fmt.pix.pixelformat); -+ if (info && info->mem_planes == 1) -+ return true; -+#endif -+ -+ return false; -+} -+ -+static int pix_format_eq(const struct v4l2_pix_format *ref, -+ const struct v4l2_pix_format *tgt, int strict) -+{ -+ /* check if the two formats are equivalent. 
-+ * ANY fields are handled gracefully -+ */ -+#define _pix_format_eq0(x) \ -+ if (ref->x != tgt->x) \ -+ result = 0 -+#define _pix_format_eq1(x, def) \ -+ do { \ -+ if ((def != tgt->x) && (ref->x != tgt->x)) { \ -+ printk(KERN_INFO #x " failed"); \ -+ result = 0; \ -+ } \ -+ } while (0) -+ int result = 1; -+ _pix_format_eq0(width); -+ _pix_format_eq0(height); -+ _pix_format_eq0(pixelformat); -+ if (!strict) -+ return result; -+ _pix_format_eq1(field, V4L2_FIELD_ANY); -+ _pix_format_eq0(bytesperline); -+ _pix_format_eq0(sizeimage); -+ _pix_format_eq1(colorspace, V4L2_COLORSPACE_DEFAULT); -+ return result; -+} -+ -+static struct v4l2_loopback_device *v4l2loopback_getdevice(struct file *f); -+static int inner_try_setfmt(struct file *file, struct v4l2_format *fmt) -+{ -+ int capture = V4L2_TYPE_IS_CAPTURE(fmt->type); -+ struct v4l2_loopback_device *dev; -+ int needschange = 0; -+ char buf[5]; -+ buf[4] = 0; -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ needschange = !(pix_format_eq(&dev->pix_format, &fmt->fmt.pix, 0)); -+ if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) { -+ fmt->fmt.pix = dev->pix_format; -+ if (needschange) { -+ if (dev->active_readers > 0 && capture) { -+ /* cannot call fmt_cap while there are readers */ -+ return -EBUSY; -+ } -+ if (dev->ready_for_capture > 0 && !capture) { -+ /* cannot call fmt_out while there are writers */ -+ return -EBUSY; -+ } -+ } -+ } -+ if (v4l2l_fill_format(fmt, capture, dev->min_width, dev->max_width, -+ dev->min_height, dev->max_height) != 0) { -+ return -EINVAL; -+ } -+ -+ if (1) { -+ char buf[5]; -+ buf[4] = 0; -+ dprintk("capFOURCC=%s\n", -+ fourcc2str(dev->pix_format.pixelformat, buf)); -+ } -+ return 0; -+} -+ -+static int set_timeperframe(struct v4l2_loopback_device *dev, -+ struct v4l2_fract *tpf) -+{ -+ if ((tpf->denominator < 1) || (tpf->numerator < 1)) { -+ return -EINVAL; -+ } -+ dev->capture_param.timeperframe = *tpf; -+ dev->frame_jiffies = max(1UL, msecs_to_jiffies(1000) * tpf->numerator / -+ tpf->denominator); -+ return 0; -+} -+ -+static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd); -+ -+/* device attributes */ -+/* available via sysfs: /sys/devices/virtual/video4linux/video* */ -+ -+static ssize_t attr_show_format(struct device *cd, -+ struct device_attribute *attr, char *buf) -+{ -+ /* gets the current format as "FOURCC:WxH@f/s", e.g. 
"YUYV:320x240@1000/30" */ -+ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); -+ const struct v4l2_fract *tpf; -+ char buf4cc[5], buf_fps[32]; -+ -+ if (!dev || !V4L2LOOPBACK_IS_FIXED_FMT(dev)) -+ return 0; -+ tpf = &dev->capture_param.timeperframe; -+ -+ fourcc2str(dev->pix_format.pixelformat, buf4cc); -+ buf4cc[4] = 0; -+ if (tpf->numerator == 1) -+ snprintf(buf_fps, sizeof(buf_fps), "%d", tpf->denominator); -+ else -+ snprintf(buf_fps, sizeof(buf_fps), "%d/%d", tpf->denominator, -+ tpf->numerator); -+ return sprintf(buf, "%4s:%dx%d@%s\n", buf4cc, dev->pix_format.width, -+ dev->pix_format.height, buf_fps); -+} -+ -+static ssize_t attr_store_format(struct device *cd, -+ struct device_attribute *attr, const char *buf, -+ size_t len) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); -+ int fps_num = 0, fps_den = 1; -+ -+ if (!dev) -+ return -ENODEV; -+ -+ /* only fps changing is supported */ -+ if (sscanf(buf, "@%d/%d", &fps_num, &fps_den) > 0) { -+ struct v4l2_fract f = { .numerator = fps_den, -+ .denominator = fps_num }; -+ int err = 0; -+ if ((err = set_timeperframe(dev, &f)) < 0) -+ return err; -+ return len; -+ } -+ return -EINVAL; -+} -+ -+static DEVICE_ATTR(format, S_IRUGO | S_IWUSR, attr_show_format, -+ attr_store_format); -+ -+static ssize_t attr_show_buffers(struct device *cd, -+ struct device_attribute *attr, char *buf) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); -+ -+ if (!dev) -+ return -ENODEV; -+ -+ return sprintf(buf, "%d\n", dev->used_buffers); -+} -+ -+static DEVICE_ATTR(buffers, S_IRUGO, attr_show_buffers, NULL); -+ -+static ssize_t attr_show_maxopeners(struct device *cd, -+ struct device_attribute *attr, char *buf) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); -+ -+ if (!dev) -+ return -ENODEV; -+ -+ return sprintf(buf, "%d\n", dev->max_openers); -+} -+ -+static ssize_t attr_store_maxopeners(struct device *cd, -+ struct device_attribute *attr, -+ const char *buf, size_t len) -+{ -+ struct v4l2_loopback_device *dev = NULL; -+ unsigned long curr = 0; -+ -+ if (kstrtoul(buf, 0, &curr)) -+ return -EINVAL; -+ -+ dev = v4l2loopback_cd2dev(cd); -+ if (!dev) -+ return -ENODEV; -+ -+ if (dev->max_openers == curr) -+ return len; -+ -+ if (curr > __INT_MAX__ || dev->open_count.counter > curr) { -+ /* request to limit to less openers as are currently attached to us */ -+ return -EINVAL; -+ } -+ -+ dev->max_openers = (int)curr; -+ -+ return len; -+} -+ -+static DEVICE_ATTR(max_openers, S_IRUGO | S_IWUSR, attr_show_maxopeners, -+ attr_store_maxopeners); -+ -+static ssize_t attr_show_state(struct device *cd, struct device_attribute *attr, -+ char *buf) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); -+ -+ if (!dev) -+ return -ENODEV; -+ -+ if (dev->ready_for_capture) -+ return sprintf(buf, "capture\n"); -+ if (dev->ready_for_output) -+ return sprintf(buf, "output\n"); -+ -+ return -EAGAIN; -+} -+ -+static DEVICE_ATTR(state, S_IRUGO, attr_show_state, NULL); -+ -+static void v4l2loopback_remove_sysfs(struct video_device *vdev) -+{ -+#define V4L2_SYSFS_DESTROY(x) device_remove_file(&vdev->dev, &dev_attr_##x) -+ -+ if (vdev) { -+ V4L2_SYSFS_DESTROY(format); -+ V4L2_SYSFS_DESTROY(buffers); -+ V4L2_SYSFS_DESTROY(max_openers); -+ V4L2_SYSFS_DESTROY(state); -+ /* ... 
*/ -+ } -+} -+ -+static void v4l2loopback_create_sysfs(struct video_device *vdev) -+{ -+ int res = 0; -+ -+#define V4L2_SYSFS_CREATE(x) \ -+ res = device_create_file(&vdev->dev, &dev_attr_##x); \ -+ if (res < 0) \ -+ break -+ if (!vdev) -+ return; -+ do { -+ V4L2_SYSFS_CREATE(format); -+ V4L2_SYSFS_CREATE(buffers); -+ V4L2_SYSFS_CREATE(max_openers); -+ V4L2_SYSFS_CREATE(state); -+ /* ... */ -+ } while (0); -+ -+ if (res >= 0) -+ return; -+ dev_err(&vdev->dev, "%s error: %d\n", __func__, res); -+} -+ -+/* Event APIs */ -+ -+#define V4L2LOOPBACK_EVENT_BASE (V4L2_EVENT_PRIVATE_START) -+#define V4L2LOOPBACK_EVENT_OFFSET 0x08E00000 -+#define V4L2_EVENT_PRI_CLIENT_USAGE \ -+ (V4L2LOOPBACK_EVENT_BASE + V4L2LOOPBACK_EVENT_OFFSET + 1) -+ -+struct v4l2_event_client_usage { -+ __u32 count; -+}; -+ -+/* global module data */ -+/* find a device based on it's device-number (e.g. '3' for /dev/video3) */ -+struct v4l2loopback_lookup_cb_data { -+ int device_nr; -+ struct v4l2_loopback_device *device; -+}; -+static int v4l2loopback_lookup_cb(int id, void *ptr, void *data) -+{ -+ struct v4l2_loopback_device *device = ptr; -+ struct v4l2loopback_lookup_cb_data *cbdata = data; -+ if (cbdata && device && device->vdev) { -+ if (device->vdev->num == cbdata->device_nr) { -+ cbdata->device = device; -+ cbdata->device_nr = id; -+ return 1; -+ } -+ } -+ return 0; -+} -+static int v4l2loopback_lookup(int device_nr, -+ struct v4l2_loopback_device **device) -+{ -+ struct v4l2loopback_lookup_cb_data data = { -+ .device_nr = device_nr, -+ .device = NULL, -+ }; -+ int err = idr_for_each(&v4l2loopback_index_idr, &v4l2loopback_lookup_cb, -+ &data); -+ if (1 == err) { -+ if (device) -+ *device = data.device; -+ return data.device_nr; -+ } -+ return -ENODEV; -+} -+static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd) -+{ -+ struct video_device *loopdev = to_video_device(cd); -+ struct v4l2loopback_private *ptr = -+ (struct v4l2loopback_private *)video_get_drvdata(loopdev); -+ int nr = ptr->device_nr; -+ -+ return idr_find(&v4l2loopback_index_idr, nr); -+} -+ -+static struct v4l2_loopback_device *v4l2loopback_getdevice(struct file *f) -+{ -+ struct v4l2loopback_private *ptr = video_drvdata(f); -+ int nr = ptr->device_nr; -+ -+ return idr_find(&v4l2loopback_index_idr, nr); -+} -+ -+/* forward declarations */ -+static void client_usage_queue_event(struct video_device *vdev); -+static void init_buffers(struct v4l2_loopback_device *dev); -+static int allocate_buffers(struct v4l2_loopback_device *dev); -+static void free_buffers(struct v4l2_loopback_device *dev); -+static void try_free_buffers(struct v4l2_loopback_device *dev); -+static int allocate_timeout_image(struct v4l2_loopback_device *dev); -+static void check_timers(struct v4l2_loopback_device *dev); -+static const struct v4l2_file_operations v4l2_loopback_fops; -+static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops; -+ -+/* Queue helpers */ -+/* next functions sets buffer flags and adjusts counters accordingly */ -+static inline void set_done(struct v4l2l_buffer *buffer) -+{ -+ buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED; -+ buffer->buffer.flags |= V4L2_BUF_FLAG_DONE; -+} -+ -+static inline void set_queued(struct v4l2l_buffer *buffer) -+{ -+ buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE; -+ buffer->buffer.flags |= V4L2_BUF_FLAG_QUEUED; -+} -+ -+static inline void unset_flags(struct v4l2l_buffer *buffer) -+{ -+ buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED; -+ buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE; -+} -+ -+/* V4L2 ioctl caps and params calls 
*/ -+/* returns device capabilities -+ * called on VIDIOC_QUERYCAP -+ */ -+static int vidioc_querycap(struct file *file, void *priv, -+ struct v4l2_capability *cap) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ int device_nr = -+ ((struct v4l2loopback_private *)video_get_drvdata(dev->vdev)) -+ ->device_nr; -+ __u32 capabilities = V4L2_CAP_STREAMING | V4L2_CAP_READWRITE; -+ -+ strscpy(cap->driver, "v4l2 loopback", sizeof(cap->driver)); -+ snprintf(cap->card, sizeof(cap->card), "%s", dev->card_label); -+ snprintf(cap->bus_info, sizeof(cap->bus_info), -+ "platform:v4l2loopback-%03d", device_nr); -+ -+ if (dev->announce_all_caps) { -+ capabilities |= V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT; -+ } else { -+ if (dev->ready_for_capture) { -+ capabilities |= V4L2_CAP_VIDEO_CAPTURE; -+ } -+ if (dev->ready_for_output) { -+ capabilities |= V4L2_CAP_VIDEO_OUTPUT; -+ } -+ } -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) -+ dev->vdev->device_caps = -+#endif /* >=linux-4.7.0 */ -+ cap->device_caps = cap->capabilities = capabilities; -+ -+ cap->capabilities |= V4L2_CAP_DEVICE_CAPS; -+ -+ memset(cap->reserved, 0, sizeof(cap->reserved)); -+ return 0; -+} -+ -+static int vidioc_enum_framesizes(struct file *file, void *fh, -+ struct v4l2_frmsizeenum *argp) -+{ -+ struct v4l2_loopback_device *dev; -+ -+ /* there can be only one... */ -+ if (argp->index) -+ return -EINVAL; -+ -+ dev = v4l2loopback_getdevice(file); -+ if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) { -+ /* format has already been negotiated -+ * cannot change during runtime -+ */ -+ if (argp->pixel_format != dev->pix_format.pixelformat) -+ return -EINVAL; -+ -+ argp->type = V4L2_FRMSIZE_TYPE_DISCRETE; -+ -+ argp->discrete.width = dev->pix_format.width; -+ argp->discrete.height = dev->pix_format.height; -+ } else { -+ /* if the format has not been negotiated yet, we accept anything -+ */ -+ if (NULL == format_by_fourcc(argp->pixel_format)) -+ return -EINVAL; -+ -+ if (dev->min_width == dev->max_width && -+ dev->min_height == dev->max_height) { -+ argp->type = V4L2_FRMSIZE_TYPE_DISCRETE; -+ -+ argp->discrete.width = dev->min_width; -+ argp->discrete.height = dev->min_height; -+ } else { -+ argp->type = V4L2_FRMSIZE_TYPE_CONTINUOUS; -+ -+ argp->stepwise.min_width = dev->min_width; -+ argp->stepwise.min_height = dev->min_height; -+ -+ argp->stepwise.max_width = dev->max_width; -+ argp->stepwise.max_height = dev->max_height; -+ -+ argp->stepwise.step_width = 1; -+ argp->stepwise.step_height = 1; -+ } -+ } -+ return 0; -+} -+ -+/* returns frameinterval (fps) for the set resolution -+ * called on VIDIOC_ENUM_FRAMEINTERVALS -+ */ -+static int vidioc_enum_frameintervals(struct file *file, void *fh, -+ struct v4l2_frmivalenum *argp) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ -+ /* there can be only one... 
*/ -+ if (argp->index) -+ return -EINVAL; -+ -+ if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) { -+ if (argp->width != dev->pix_format.width || -+ argp->height != dev->pix_format.height || -+ argp->pixel_format != dev->pix_format.pixelformat) -+ return -EINVAL; -+ -+ argp->type = V4L2_FRMIVAL_TYPE_DISCRETE; -+ argp->discrete = dev->capture_param.timeperframe; -+ } else { -+ if (argp->width < dev->min_width || -+ argp->width > dev->max_width || -+ argp->height < dev->min_height || -+ argp->height > dev->max_height || -+ NULL == format_by_fourcc(argp->pixel_format)) -+ return -EINVAL; -+ -+ argp->type = V4L2_FRMIVAL_TYPE_CONTINUOUS; -+ argp->stepwise.min.numerator = 1; -+ argp->stepwise.min.denominator = V4L2LOOPBACK_FPS_MAX; -+ argp->stepwise.max.numerator = 1; -+ argp->stepwise.max.denominator = V4L2LOOPBACK_FPS_MIN; -+ argp->stepwise.step.numerator = 1; -+ argp->stepwise.step.denominator = 1; -+ } -+ -+ return 0; -+} -+ -+/* ------------------ CAPTURE ----------------------- */ -+ -+/* returns device formats -+ * called on VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE -+ */ -+static int vidioc_enum_fmt_cap(struct file *file, void *fh, -+ struct v4l2_fmtdesc *f) -+{ -+ struct v4l2_loopback_device *dev; -+ const struct v4l2l_format *fmt; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ if (f->index) -+ return -EINVAL; -+ -+ if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) { -+ /* format has been fixed, so only one single format is supported */ -+ const __u32 format = dev->pix_format.pixelformat; -+ -+ if ((fmt = format_by_fourcc(format))) { -+ snprintf(f->description, sizeof(f->description), "%s", -+ fmt->name); -+ } else { -+ snprintf(f->description, sizeof(f->description), -+ "[%c%c%c%c]", (format >> 0) & 0xFF, -+ (format >> 8) & 0xFF, (format >> 16) & 0xFF, -+ (format >> 24) & 0xFF); -+ } -+ -+ f->pixelformat = dev->pix_format.pixelformat; -+ } else { -+ return -EINVAL; -+ } -+ f->flags = 0; -+ MARK(); -+ return 0; -+} -+ -+/* returns current video format -+ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE -+ */ -+static int vidioc_g_fmt_cap(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ if (!dev->ready_for_capture && !dev->ready_for_output) -+ return -EINVAL; -+ -+ fmt->fmt.pix = dev->pix_format; -+ MARK(); -+ return 0; -+} -+ -+/* checks if it is OK to change to format fmt; -+ * actual check is done by inner_try_setfmt -+ * just checking that pixelformat is OK and set other parameters, app should -+ * obey this decision -+ * called on VIDIOC_TRY_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE -+ */ -+static int vidioc_try_fmt_cap(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ int ret = 0; -+ if (!V4L2_TYPE_IS_CAPTURE(fmt->type)) -+ return -EINVAL; -+ ret = inner_try_setfmt(file, fmt); -+ if (-EBUSY == ret) -+ return 0; -+ return ret; -+} -+ -+/* sets new output format, if possible -+ * actually format is set by input and we even do not check it, just return -+ * current one, but it is possible to set subregions of input TODO(vasaka) -+ * called on VIDIOC_S_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE -+ */ -+static int vidioc_s_fmt_cap(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ int ret; -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ if (!V4L2_TYPE_IS_CAPTURE(fmt->type)) -+ return -EINVAL; -+ ret = inner_try_setfmt(file, fmt); -+ if (!ret) { -+ 
dev->pix_format = fmt->fmt.pix; -+ } -+ return ret; -+} -+ -+/* ------------------ OUTPUT ----------------------- */ -+ -+/* returns device formats; -+ * LATER: allow all formats -+ * called on VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT -+ */ -+static int vidioc_enum_fmt_out(struct file *file, void *fh, -+ struct v4l2_fmtdesc *f) -+{ -+ struct v4l2_loopback_device *dev; -+ const struct v4l2l_format *fmt; -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) { -+ /* format has been fixed, so only one single format is supported */ -+ const __u32 format = dev->pix_format.pixelformat; -+ -+ if (f->index) -+ return -EINVAL; -+ -+ if ((fmt = format_by_fourcc(format))) { -+ snprintf(f->description, sizeof(f->description), "%s", -+ fmt->name); -+ } else { -+ snprintf(f->description, sizeof(f->description), -+ "[%c%c%c%c]", (format >> 0) & 0xFF, -+ (format >> 8) & 0xFF, (format >> 16) & 0xFF, -+ (format >> 24) & 0xFF); -+ } -+ -+ f->pixelformat = dev->pix_format.pixelformat; -+ } else { -+ /* fill in a dummy format */ -+ /* coverity[unsigned_compare] */ -+ if (f->index < 0 || f->index >= FORMATS) -+ return -EINVAL; -+ -+ fmt = &formats[f->index]; -+ -+ f->pixelformat = fmt->fourcc; -+ snprintf(f->description, sizeof(f->description), "%s", -+ fmt->name); -+ } -+ f->flags = 0; -+ -+ return 0; -+} -+ -+/* returns current video format format fmt */ -+/* NOTE: this is called from the producer -+ * so if format has not been negotiated yet, -+ * it should return ALL of available formats, -+ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT -+ */ -+static int vidioc_g_fmt_out(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ /* -+ * LATER: this should return the currently valid format -+ * gstreamer doesn't like it, if this returns -EINVAL, as it -+ * then concludes that there is _no_ valid format -+ * CHECK whether this assumption is wrong, -+ * or whether we have to always provide a valid format -+ */ -+ -+ fmt->fmt.pix = dev->pix_format; -+ return 0; -+} -+ -+/* checks if it is OK to change to format fmt; -+ * if format is negotiated do not change it -+ * called on VIDIOC_TRY_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT -+ */ -+static int vidioc_try_fmt_out(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ int ret = 0; -+ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) -+ return -EINVAL; -+ ret = inner_try_setfmt(file, fmt); -+ if (-EBUSY == ret) -+ return 0; -+ return ret; -+} -+ -+/* sets new output format, if possible; -+ * allocate data here because we do not know if it will be streaming or -+ * read/write IO -+ * called on VIDIOC_S_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT -+ */ -+static int vidioc_s_fmt_out(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ struct v4l2_loopback_device *dev; -+ int ret; -+ char buf[5]; -+ buf[4] = 0; -+ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) -+ return -EINVAL; -+ dev = v4l2loopback_getdevice(file); -+ -+ ret = inner_try_setfmt(file, fmt); -+ if (!ret) { -+ dev->pix_format = fmt->fmt.pix; -+ dev->pix_format_has_valid_sizeimage = -+ v4l2l_pix_format_has_valid_sizeimage(fmt); -+ dprintk("s_fmt_out(%d) %d...%d\n", ret, dev->ready_for_capture, -+ dev->pix_format.sizeimage); -+ dprintk("outFOURCC=%s\n", -+ fourcc2str(dev->pix_format.pixelformat, buf)); -+ -+ if (!dev->ready_for_capture) { -+ dev->buffer_size = -+ 
PAGE_ALIGN(dev->pix_format.sizeimage); -+ // JMZ: TODO get rid of the next line -+ fmt->fmt.pix.sizeimage = dev->buffer_size; -+ ret = allocate_buffers(dev); -+ } -+ } -+ return ret; -+} -+ -+// #define V4L2L_OVERLAY -+#ifdef V4L2L_OVERLAY -+/* ------------------ OVERLAY ----------------------- */ -+/* currently unsupported */ -+/* GSTreamer's v4l2sink is buggy, as it requires the overlay to work -+ * while it should only require it, if overlay is requested -+ * once the gstreamer element is fixed, remove the overlay dummies -+ */ -+#warning OVERLAY dummies -+static int vidioc_g_fmt_overlay(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ return 0; -+} -+ -+static int vidioc_s_fmt_overlay(struct file *file, void *priv, -+ struct v4l2_format *fmt) -+{ -+ return 0; -+} -+#endif /* V4L2L_OVERLAY */ -+ -+/* ------------------ PARAMs ----------------------- */ -+ -+/* get some data flow parameters, only capability, fps and readbuffers has -+ * effect on this driver -+ * called on VIDIOC_G_PARM -+ */ -+static int vidioc_g_parm(struct file *file, void *priv, -+ struct v4l2_streamparm *parm) -+{ -+ /* do not care about type of opener, hope these enums would always be -+ * compatible */ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ parm->parm.capture = dev->capture_param; -+ return 0; -+} -+ -+/* get some data flow parameters, only capability, fps and readbuffers has -+ * effect on this driver -+ * called on VIDIOC_S_PARM -+ */ -+static int vidioc_s_parm(struct file *file, void *priv, -+ struct v4l2_streamparm *parm) -+{ -+ struct v4l2_loopback_device *dev; -+ int err = 0; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ dprintk("vidioc_s_parm called frate=%d/%d\n", -+ parm->parm.capture.timeperframe.numerator, -+ parm->parm.capture.timeperframe.denominator); -+ -+ switch (parm->type) { -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ if ((err = set_timeperframe( -+ dev, &parm->parm.capture.timeperframe)) < 0) -+ return err; -+ break; -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ if ((err = set_timeperframe( -+ dev, &parm->parm.capture.timeperframe)) < 0) -+ return err; -+ break; -+ default: -+ return -1; -+ } -+ -+ parm->parm.capture = dev->capture_param; -+ return 0; -+} -+ -+#ifdef V4L2LOOPBACK_WITH_STD -+/* sets a tv standard, actually we do not need to handle this any special way -+ * added to support effecttv -+ * called on VIDIOC_S_STD -+ */ -+static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id *_std) -+{ -+ v4l2_std_id req_std = 0, supported_std = 0; -+ const v4l2_std_id all_std = V4L2_STD_ALL, no_std = 0; -+ -+ if (_std) { -+ req_std = *_std; -+ *_std = all_std; -+ } -+ -+ /* we support everything in V4L2_STD_ALL, but not more... 
*/ -+ supported_std = (all_std & req_std); -+ if (no_std == supported_std) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+/* gets a fake video standard -+ * called on VIDIOC_G_STD -+ */ -+static int vidioc_g_std(struct file *file, void *fh, v4l2_std_id *norm) -+{ -+ if (norm) -+ *norm = V4L2_STD_ALL; -+ return 0; -+} -+/* gets a fake video standard -+ * called on VIDIOC_QUERYSTD -+ */ -+static int vidioc_querystd(struct file *file, void *fh, v4l2_std_id *norm) -+{ -+ if (norm) -+ *norm = V4L2_STD_ALL; -+ return 0; -+} -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+static int v4l2loopback_set_ctrl(struct v4l2_loopback_device *dev, u32 id, -+ s64 val) -+{ -+ switch (id) { -+ case CID_KEEP_FORMAT: -+ if (val < 0 || val > 1) -+ return -EINVAL; -+ dev->keep_format = val; -+ try_free_buffers( -+ dev); /* will only free buffers if !keep_format */ -+ break; -+ case CID_SUSTAIN_FRAMERATE: -+ if (val < 0 || val > 1) -+ return -EINVAL; -+ spin_lock_bh(&dev->lock); -+ dev->sustain_framerate = val; -+ check_timers(dev); -+ spin_unlock_bh(&dev->lock); -+ break; -+ case CID_TIMEOUT: -+ if (val < 0 || val > MAX_TIMEOUT) -+ return -EINVAL; -+ spin_lock_bh(&dev->lock); -+ dev->timeout_jiffies = msecs_to_jiffies(val); -+ check_timers(dev); -+ spin_unlock_bh(&dev->lock); -+ allocate_timeout_image(dev); -+ break; -+ case CID_TIMEOUT_IMAGE_IO: -+ dev->timeout_image_io = 1; -+ break; -+ default: -+ return -EINVAL; -+ } -+ return 0; -+} -+ -+static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl) -+{ -+ struct v4l2_loopback_device *dev = container_of( -+ ctrl->handler, struct v4l2_loopback_device, ctrl_handler); -+ return v4l2loopback_set_ctrl(dev, ctrl->id, ctrl->val); -+} -+ -+/* returns set of device outputs, in our case there is only one -+ * called on VIDIOC_ENUMOUTPUT -+ */ -+static int vidioc_enum_output(struct file *file, void *fh, -+ struct v4l2_output *outp) -+{ -+ __u32 index = outp->index; -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ MARK(); -+ -+ if (!dev->announce_all_caps && !dev->ready_for_output) -+ return -ENOTTY; -+ -+ if (0 != index) -+ return -EINVAL; -+ -+ /* clear all data (including the reserved fields) */ -+ memset(outp, 0, sizeof(*outp)); -+ -+ outp->index = index; -+ strscpy(outp->name, "loopback in", sizeof(outp->name)); -+ outp->type = V4L2_OUTPUT_TYPE_ANALOG; -+ outp->audioset = 0; -+ outp->modulator = 0; -+#ifdef V4L2LOOPBACK_WITH_STD -+ outp->std = V4L2_STD_ALL; -+#ifdef V4L2_OUT_CAP_STD -+ outp->capabilities |= V4L2_OUT_CAP_STD; -+#endif /* V4L2_OUT_CAP_STD */ -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+ return 0; -+} -+ -+/* which output is currently active, -+ * called on VIDIOC_G_OUTPUT -+ */ -+static int vidioc_g_output(struct file *file, void *fh, unsigned int *i) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ if (!dev->announce_all_caps && !dev->ready_for_output) -+ return -ENOTTY; -+ if (i) -+ *i = 0; -+ return 0; -+} -+ -+/* set output, can make sense if we have more than one video src, -+ * called on VIDIOC_S_OUTPUT -+ */ -+static int vidioc_s_output(struct file *file, void *fh, unsigned int i) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ if (!dev->announce_all_caps && !dev->ready_for_output) -+ return -ENOTTY; -+ -+ if (i) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+/* returns set of device inputs, in our case there is only one, -+ * but later I may add more -+ * called on VIDIOC_ENUMINPUT -+ */ -+static int vidioc_enum_input(struct file *file, void *fh, -+ struct v4l2_input *inp) -+{ -+ struct 
v4l2_loopback_device *dev; -+ __u32 index = inp->index; -+ MARK(); -+ -+ if (0 != index) -+ return -EINVAL; -+ -+ /* clear all data (including the reserved fields) */ -+ memset(inp, 0, sizeof(*inp)); -+ -+ inp->index = index; -+ strscpy(inp->name, "loopback", sizeof(inp->name)); -+ inp->type = V4L2_INPUT_TYPE_CAMERA; -+ inp->audioset = 0; -+ inp->tuner = 0; -+ inp->status = 0; -+ -+#ifdef V4L2LOOPBACK_WITH_STD -+ inp->std = V4L2_STD_ALL; -+#ifdef V4L2_IN_CAP_STD -+ inp->capabilities |= V4L2_IN_CAP_STD; -+#endif -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+ dev = v4l2loopback_getdevice(file); -+ if (!dev->ready_for_capture) { -+ inp->status |= V4L2_IN_ST_NO_SIGNAL; -+ } -+ -+ return 0; -+} -+ -+/* which input is currently active, -+ * called on VIDIOC_G_INPUT -+ */ -+static int vidioc_g_input(struct file *file, void *fh, unsigned int *i) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ if (!dev->announce_all_caps && !dev->ready_for_capture) -+ return -ENOTTY; -+ if (i) -+ *i = 0; -+ return 0; -+} -+ -+/* set input, can make sense if we have more than one video src, -+ * called on VIDIOC_S_INPUT -+ */ -+static int vidioc_s_input(struct file *file, void *fh, unsigned int i) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ if (!dev->announce_all_caps && !dev->ready_for_capture) -+ return -ENOTTY; -+ if (i == 0) -+ return 0; -+ return -EINVAL; -+} -+ -+/* --------------- V4L2 ioctl buffer related calls ----------------- */ -+ -+/* negotiate buffer type -+ * only mmap streaming supported -+ * called on VIDIOC_REQBUFS -+ */ -+static int vidioc_reqbufs(struct file *file, void *fh, -+ struct v4l2_requestbuffers *b) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ int i; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ -+ dprintk("reqbufs: %d\t%d=%d\n", b->memory, b->count, -+ dev->buffers_number); -+ -+ if (opener->timeout_image_io) { -+ dev->timeout_image_io = 0; -+ if (b->memory != V4L2_MEMORY_MMAP) -+ return -EINVAL; -+ b->count = 2; -+ return 0; -+ } -+ -+ if (V4L2_TYPE_IS_OUTPUT(b->type) && (!dev->ready_for_output)) { -+ return -EBUSY; -+ } -+ -+ init_buffers(dev); -+ switch (b->memory) { -+ case V4L2_MEMORY_MMAP: -+ /* do nothing here, buffers are always allocated */ -+ if (b->count < 1 || dev->buffers_number < 1) -+ return 0; -+ -+ if (b->count > dev->buffers_number) -+ b->count = dev->buffers_number; -+ -+ /* make sure that outbufs_list contains buffers from 0 to used_buffers-1 -+ * actually, it will have been already populated via v4l2_loopback_init() -+ * at this point */ -+ if (list_empty(&dev->outbufs_list)) { -+ for (i = 0; i < dev->used_buffers; ++i) -+ list_add_tail(&dev->buffers[i].list_head, -+ &dev->outbufs_list); -+ } -+ -+ /* also, if dev->used_buffers is going to be decreased, we should remove -+ * out-of-range buffers from outbufs_list, and fix bufpos2index mapping */ -+ if (b->count < dev->used_buffers) { -+ struct v4l2l_buffer *pos, *n; -+ -+ list_for_each_entry_safe(pos, n, &dev->outbufs_list, -+ list_head) { -+ if (pos->buffer.index >= b->count) -+ list_del(&pos->list_head); -+ } -+ -+ /* after we update dev->used_buffers, buffers in outbufs_list will -+ * correspond to dev->write_position + [0;b->count-1] range */ -+ i = dev->write_position % b->count; -+ list_for_each_entry(pos, &dev->outbufs_list, -+ list_head) { -+ dev->bufpos2index[i % b->count] = -+ pos->buffer.index; -+ ++i; -+ } -+ } -+ -+ opener->buffers_number = b->count; -+ if 
(opener->buffers_number < dev->used_buffers) -+ dev->used_buffers = opener->buffers_number; -+ return 0; -+ default: -+ return -EINVAL; -+ } -+} -+ -+/* returns buffer asked for; -+ * give app as many buffers as it wants, if it less than MAX, -+ * but map them in our inner buffers -+ * called on VIDIOC_QUERYBUF -+ */ -+static int vidioc_querybuf(struct file *file, void *fh, struct v4l2_buffer *b) -+{ -+ enum v4l2_buf_type type; -+ int index; -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ -+ MARK(); -+ -+ type = b->type; -+ index = b->index; -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ -+ if ((b->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) && -+ (b->type != V4L2_BUF_TYPE_VIDEO_OUTPUT)) { -+ return -EINVAL; -+ } -+ if (b->index > max_buffers) -+ return -EINVAL; -+ -+ if (opener->timeout_image_io) -+ *b = dev->timeout_image_buffer.buffer; -+ else -+ *b = dev->buffers[b->index % dev->used_buffers].buffer; -+ -+ b->type = type; -+ b->index = index; -+ dprintkrw("buffer type: %d (of %d with size=%ld)\n", b->memory, -+ dev->buffers_number, dev->buffer_size); -+ -+ /* Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture' -+ https://github.com/umlaeute/v4l2loopback/issues/60 */ -+ b->flags &= ~V4L2_BUF_FLAG_DONE; -+ b->flags |= V4L2_BUF_FLAG_QUEUED; -+ -+ return 0; -+} -+ -+static void buffer_written(struct v4l2_loopback_device *dev, -+ struct v4l2l_buffer *buf) -+{ -+ del_timer_sync(&dev->sustain_timer); -+ del_timer_sync(&dev->timeout_timer); -+ -+ spin_lock_bh(&dev->list_lock); -+ list_move_tail(&buf->list_head, &dev->outbufs_list); -+ spin_unlock_bh(&dev->list_lock); -+ -+ spin_lock_bh(&dev->lock); -+ dev->bufpos2index[dev->write_position % dev->used_buffers] = -+ buf->buffer.index; -+ ++dev->write_position; -+ dev->reread_count = 0; -+ -+ check_timers(dev); -+ spin_unlock_bh(&dev->lock); -+} -+ -+/* put buffer to queue -+ * called on VIDIOC_QBUF -+ */ -+static int vidioc_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ struct v4l2l_buffer *b; -+ int index; -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ -+ if (buf->index > max_buffers) -+ return -EINVAL; -+ if (opener->timeout_image_io) -+ return 0; -+ -+ index = buf->index % dev->used_buffers; -+ b = &dev->buffers[index]; -+ -+ switch (buf->type) { -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ dprintkrw( -+ "qbuf(CAPTURE)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n", -+ index, buf->index, buf, buf->type, buf->bytesused, -+ buf->length, buf->flags, buf->field, -+ (long long)buf->timestamp.tv_sec, -+ (long int)buf->timestamp.tv_usec, buf->sequence); -+ set_queued(b); -+ return 0; -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ dprintkrw( -+ "qbuf(OUTPUT)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n", -+ index, buf->index, buf, buf->type, buf->bytesused, -+ buf->length, buf->flags, buf->field, -+ (long long)buf->timestamp.tv_sec, -+ (long int)buf->timestamp.tv_usec, buf->sequence); -+ if ((!(b->buffer.flags & V4L2_BUF_FLAG_TIMESTAMP_COPY)) && -+ (buf->timestamp.tv_sec == 0 && buf->timestamp.tv_usec == 0)) -+ v4l2l_get_timestamp(&b->buffer); -+ else { -+ b->buffer.timestamp = buf->timestamp; -+ b->buffer.flags |= V4L2_BUF_FLAG_TIMESTAMP_COPY; -+ } -+ if (dev->pix_format_has_valid_sizeimage) { -+ if (buf->bytesused >= dev->pix_format.sizeimage) { -+ 
b->buffer.bytesused = dev->pix_format.sizeimage; -+ } else { -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) -+ dev_warn_ratelimited( -+ &dev->vdev->dev, -+#else -+ dprintkrw( -+#endif -+ "warning queued output buffer bytesused too small %d < %d\n", -+ buf->bytesused, -+ dev->pix_format.sizeimage); -+ b->buffer.bytesused = buf->bytesused; -+ } -+ } else { -+ b->buffer.bytesused = buf->bytesused; -+ } -+ -+ set_done(b); -+ buffer_written(dev, b); -+ -+ /* Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture' -+ https://github.com/umlaeute/v4l2loopback/issues/60 */ -+ buf->flags &= ~V4L2_BUF_FLAG_DONE; -+ buf->flags |= V4L2_BUF_FLAG_QUEUED; -+ -+ wake_up_all(&dev->read_event); -+ return 0; -+ default: -+ return -EINVAL; -+ } -+} -+ -+static int can_read(struct v4l2_loopback_device *dev, -+ struct v4l2_loopback_opener *opener) -+{ -+ int ret; -+ -+ spin_lock_bh(&dev->lock); -+ check_timers(dev); -+ ret = dev->write_position > opener->read_position || -+ dev->reread_count > opener->reread_count || dev->timeout_happened; -+ spin_unlock_bh(&dev->lock); -+ return ret; -+} -+ -+static int get_capture_buffer(struct file *file) -+{ -+ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); -+ struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data); -+ int pos, ret; -+ int timeout_happened; -+ -+ if ((file->f_flags & O_NONBLOCK) && -+ (dev->write_position <= opener->read_position && -+ dev->reread_count <= opener->reread_count && -+ !dev->timeout_happened)) -+ return -EAGAIN; -+ wait_event_interruptible(dev->read_event, can_read(dev, opener)); -+ -+ spin_lock_bh(&dev->lock); -+ if (dev->write_position == opener->read_position) { -+ if (dev->reread_count > opener->reread_count + 2) -+ opener->reread_count = dev->reread_count - 1; -+ ++opener->reread_count; -+ pos = (opener->read_position + dev->used_buffers - 1) % -+ dev->used_buffers; -+ } else { -+ opener->reread_count = 0; -+ if (dev->write_position > -+ opener->read_position + dev->used_buffers) -+ opener->read_position = dev->write_position - 1; -+ pos = opener->read_position % dev->used_buffers; -+ ++opener->read_position; -+ } -+ timeout_happened = dev->timeout_happened; -+ dev->timeout_happened = 0; -+ spin_unlock_bh(&dev->lock); -+ -+ ret = dev->bufpos2index[pos]; -+ if (timeout_happened) { -+ if (ret < 0) { -+ dprintk("trying to return not mapped buf[%d]\n", ret); -+ return -EFAULT; -+ } -+ /* although allocated on-demand, timeout_image is freed only -+ * in free_buffers(), so we don't need to worry about it being -+ * deallocated suddenly */ -+ memcpy(dev->image + dev->buffers[ret].buffer.m.offset, -+ dev->timeout_image, dev->buffer_size); -+ } -+ return ret; -+} -+ -+/* put buffer to dequeue -+ * called on VIDIOC_DQBUF -+ */ -+static int vidioc_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ int index; -+ struct v4l2l_buffer *b; -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ if (opener->timeout_image_io) { -+ *buf = dev->timeout_image_buffer.buffer; -+ return 0; -+ } -+ -+ switch (buf->type) { -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ index = get_capture_buffer(file); -+ if (index < 0) -+ return index; -+ dprintkrw("capture DQBUF pos: %lld index: %d\n", -+ (long long)(opener->read_position - 1), index); -+ if (!(dev->buffers[index].buffer.flags & -+ V4L2_BUF_FLAG_MAPPED)) { -+ dprintk("trying to return not mapped buf[%d]\n", index); -+ return -EINVAL; -+ } -+ 
unset_flags(&dev->buffers[index]); -+ *buf = dev->buffers[index].buffer; -+ dprintkrw( -+ "dqbuf(CAPTURE)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n", -+ index, buf->index, buf, buf->type, buf->bytesused, -+ buf->length, buf->flags, buf->field, -+ (long long)buf->timestamp.tv_sec, -+ (long int)buf->timestamp.tv_usec, buf->sequence); -+ return 0; -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ spin_lock_bh(&dev->list_lock); -+ -+ b = list_entry(dev->outbufs_list.prev, struct v4l2l_buffer, -+ list_head); -+ list_move_tail(&b->list_head, &dev->outbufs_list); -+ -+ spin_unlock_bh(&dev->list_lock); -+ dprintkrw("output DQBUF index: %d\n", b->buffer.index); -+ unset_flags(b); -+ *buf = b->buffer; -+ buf->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; -+ dprintkrw( -+ "dqbuf(OUTPUT)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n", -+ index, buf->index, buf, buf->type, buf->bytesused, -+ buf->length, buf->flags, buf->field, -+ (long long)buf->timestamp.tv_sec, -+ (long int)buf->timestamp.tv_usec, buf->sequence); -+ return 0; -+ default: -+ return -EINVAL; -+ } -+} -+ -+/* ------------- STREAMING ------------------- */ -+ -+/* start streaming -+ * called on VIDIOC_STREAMON -+ */ -+static int vidioc_streamon(struct file *file, void *fh, enum v4l2_buf_type type) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ -+ switch (type) { -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ if (!dev->ready_for_capture) { -+ int ret = allocate_buffers(dev); -+ if (ret < 0) -+ return ret; -+ } -+ opener->type = WRITER; -+ dev->ready_for_output = 0; -+ dev->ready_for_capture++; -+ return 0; -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ if (!dev->ready_for_capture) -+ return -EIO; -+ if (dev->active_readers > 0) -+ return -EBUSY; -+ opener->type = READER; -+ dev->active_readers++; -+ client_usage_queue_event(dev->vdev); -+ return 0; -+ default: -+ return -EINVAL; -+ } -+ return -EINVAL; -+} -+ -+/* stop streaming -+ * called on VIDIOC_STREAMOFF -+ */ -+static int vidioc_streamoff(struct file *file, void *fh, -+ enum v4l2_buf_type type) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ -+ MARK(); -+ dprintk("%d\n", type); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(fh); -+ switch (type) { -+ case V4L2_BUF_TYPE_VIDEO_OUTPUT: -+ if (dev->ready_for_capture > 0) -+ dev->ready_for_capture--; -+ return 0; -+ case V4L2_BUF_TYPE_VIDEO_CAPTURE: -+ if (opener->type == READER) { -+ opener->type = 0; -+ dev->active_readers--; -+ client_usage_queue_event(dev->vdev); -+ } -+ return 0; -+ default: -+ return -EINVAL; -+ } -+ return -EINVAL; -+} -+ -+#ifdef CONFIG_VIDEO_V4L1_COMPAT -+static int vidiocgmbuf(struct file *file, void *fh, struct video_mbuf *p) -+{ -+ struct v4l2_loopback_device *dev; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ p->frames = dev->buffers_number; -+ p->offsets[0] = 0; -+ p->offsets[1] = 0; -+ p->size = dev->buffer_size; -+ return 0; -+} -+#endif -+ -+static void client_usage_queue_event(struct video_device *vdev) -+{ -+ struct v4l2_event ev; -+ struct v4l2_loopback_device *dev; -+ -+ dev = container_of(vdev->v4l2_dev, struct v4l2_loopback_device, -+ v4l2_dev); -+ -+ memset(&ev, 0, sizeof(ev)); -+ ev.type = V4L2_EVENT_PRI_CLIENT_USAGE; -+ ((struct v4l2_event_client_usage *)&ev.u)->count = dev->active_readers; -+ -+ v4l2_event_queue(vdev, &ev); -+} -+ 
-+static int client_usage_ops_add(struct v4l2_subscribed_event *sev, -+ unsigned elems) -+{ -+ if (!(sev->flags & V4L2_EVENT_SUB_FL_SEND_INITIAL)) -+ return 0; -+ -+ client_usage_queue_event(sev->fh->vdev); -+ return 0; -+} -+ -+static void client_usage_ops_replace(struct v4l2_event *old, -+ const struct v4l2_event *new) -+{ -+ *((struct v4l2_event_client_usage *)&old->u) = -+ *((struct v4l2_event_client_usage *)&new->u); -+} -+ -+static void client_usage_ops_merge(const struct v4l2_event *old, -+ struct v4l2_event *new) -+{ -+ *((struct v4l2_event_client_usage *)&new->u) = -+ *((struct v4l2_event_client_usage *)&old->u); -+} -+ -+const struct v4l2_subscribed_event_ops client_usage_ops = { -+ .add = client_usage_ops_add, -+ .replace = client_usage_ops_replace, -+ .merge = client_usage_ops_merge, -+}; -+ -+static int vidioc_subscribe_event(struct v4l2_fh *fh, -+ const struct v4l2_event_subscription *sub) -+{ -+ switch (sub->type) { -+ case V4L2_EVENT_CTRL: -+ return v4l2_ctrl_subscribe_event(fh, sub); -+ case V4L2_EVENT_PRI_CLIENT_USAGE: -+ return v4l2_event_subscribe(fh, sub, 0, &client_usage_ops); -+ } -+ -+ return -EINVAL; -+} -+ -+/* file operations */ -+static void vm_open(struct vm_area_struct *vma) -+{ -+ struct v4l2l_buffer *buf; -+ MARK(); -+ -+ buf = vma->vm_private_data; -+ buf->use_count++; -+ -+ buf->buffer.flags |= V4L2_BUF_FLAG_MAPPED; -+} -+ -+static void vm_close(struct vm_area_struct *vma) -+{ -+ struct v4l2l_buffer *buf; -+ MARK(); -+ -+ buf = vma->vm_private_data; -+ buf->use_count--; -+ -+ if (buf->use_count <= 0) -+ buf->buffer.flags &= ~V4L2_BUF_FLAG_MAPPED; -+} -+ -+static struct vm_operations_struct vm_ops = { -+ .open = vm_open, -+ .close = vm_close, -+}; -+ -+static int v4l2_loopback_mmap(struct file *file, struct vm_area_struct *vma) -+{ -+ u8 *addr; -+ unsigned long start; -+ unsigned long size; -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ struct v4l2l_buffer *buffer = NULL; -+ MARK(); -+ -+ start = (unsigned long)vma->vm_start; -+ size = (unsigned long)(vma->vm_end - vma->vm_start); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(file->private_data); -+ -+ if (size > dev->buffer_size) { -+ dprintk("userspace tries to mmap too much, fail\n"); -+ return -EINVAL; -+ } -+ if (opener->timeout_image_io) { -+ /* we are going to map the timeout_image_buffer */ -+ if ((vma->vm_pgoff << PAGE_SHIFT) != -+ dev->buffer_size * MAX_BUFFERS) { -+ dprintk("invalid mmap offset for timeout_image_io mode\n"); -+ return -EINVAL; -+ } -+ } else if ((vma->vm_pgoff << PAGE_SHIFT) > -+ dev->buffer_size * (dev->buffers_number - 1)) { -+ dprintk("userspace tries to mmap too far, fail\n"); -+ return -EINVAL; -+ } -+ -+ /* FIXXXXXME: allocation should not happen here! 
*/ -+ if (NULL == dev->image) -+ if (allocate_buffers(dev) < 0) -+ return -EINVAL; -+ -+ if (opener->timeout_image_io) { -+ buffer = &dev->timeout_image_buffer; -+ addr = dev->timeout_image; -+ } else { -+ int i; -+ for (i = 0; i < dev->buffers_number; ++i) { -+ buffer = &dev->buffers[i]; -+ if ((buffer->buffer.m.offset >> PAGE_SHIFT) == -+ vma->vm_pgoff) -+ break; -+ } -+ -+ if (i >= dev->buffers_number) -+ return -EINVAL; -+ -+ addr = dev->image + (vma->vm_pgoff << PAGE_SHIFT); -+ } -+ -+ while (size > 0) { -+ struct page *page; -+ -+ page = vmalloc_to_page(addr); -+ -+ if (vm_insert_page(vma, start, page) < 0) -+ return -EAGAIN; -+ -+ start += PAGE_SIZE; -+ addr += PAGE_SIZE; -+ size -= PAGE_SIZE; -+ } -+ -+ vma->vm_ops = &vm_ops; -+ vma->vm_private_data = buffer; -+ -+ vm_open(vma); -+ -+ MARK(); -+ return 0; -+} -+ -+static unsigned int v4l2_loopback_poll(struct file *file, -+ struct poll_table_struct *pts) -+{ -+ struct v4l2_loopback_opener *opener; -+ struct v4l2_loopback_device *dev; -+ __poll_t req_events = poll_requested_events(pts); -+ int ret_mask = 0; -+ MARK(); -+ -+ opener = fh_to_opener(file->private_data); -+ dev = v4l2loopback_getdevice(file); -+ -+ if (req_events & POLLPRI) { -+ if (!v4l2_event_pending(&opener->fh)) -+ poll_wait(file, &opener->fh.wait, pts); -+ if (v4l2_event_pending(&opener->fh)) { -+ ret_mask |= POLLPRI; -+ if (!(req_events & DEFAULT_POLLMASK)) -+ return ret_mask; -+ } -+ } -+ -+ switch (opener->type) { -+ case WRITER: -+ ret_mask |= POLLOUT | POLLWRNORM; -+ break; -+ case READER: -+ if (!can_read(dev, opener)) { -+ if (ret_mask) -+ return ret_mask; -+ poll_wait(file, &dev->read_event, pts); -+ } -+ if (can_read(dev, opener)) -+ ret_mask |= POLLIN | POLLRDNORM; -+ if (v4l2_event_pending(&opener->fh)) -+ ret_mask |= POLLPRI; -+ break; -+ default: -+ break; -+ } -+ -+ MARK(); -+ return ret_mask; -+} -+ -+/* do not want to limit device opens, it can be as many readers as user want, -+ * writers are limited by means of setting writer field */ -+static int v4l2_loopback_open(struct file *file) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_opener *opener; -+ MARK(); -+ dev = v4l2loopback_getdevice(file); -+ if (dev->open_count.counter >= dev->max_openers) -+ return -EBUSY; -+ /* kfree on close */ -+ opener = kzalloc(sizeof(*opener), GFP_KERNEL); -+ if (opener == NULL) -+ return -ENOMEM; -+ -+ atomic_inc(&dev->open_count); -+ -+ opener->timeout_image_io = dev->timeout_image_io; -+ if (opener->timeout_image_io) { -+ int r = allocate_timeout_image(dev); -+ -+ if (r < 0) { -+ dprintk("timeout image allocation failed\n"); -+ -+ atomic_dec(&dev->open_count); -+ -+ kfree(opener); -+ return r; -+ } -+ } -+ -+ v4l2_fh_init(&opener->fh, video_devdata(file)); -+ file->private_data = &opener->fh; -+ -+ v4l2_fh_add(&opener->fh); -+ dprintk("opened dev:%p with image:%p\n", dev, dev ? 
dev->image : NULL); -+ MARK(); -+ return 0; -+} -+ -+static int v4l2_loopback_close(struct file *file) -+{ -+ struct v4l2_loopback_opener *opener; -+ struct v4l2_loopback_device *dev; -+ int is_writer = 0, is_reader = 0; -+ MARK(); -+ -+ opener = fh_to_opener(file->private_data); -+ dev = v4l2loopback_getdevice(file); -+ -+ if (WRITER == opener->type) -+ is_writer = 1; -+ if (READER == opener->type) -+ is_reader = 1; -+ -+ atomic_dec(&dev->open_count); -+ if (dev->open_count.counter == 0) { -+ del_timer_sync(&dev->sustain_timer); -+ del_timer_sync(&dev->timeout_timer); -+ } -+ try_free_buffers(dev); -+ -+ v4l2_fh_del(&opener->fh); -+ v4l2_fh_exit(&opener->fh); -+ -+ kfree(opener); -+ if (is_writer) -+ dev->ready_for_output = 1; -+ if (is_reader) { -+ dev->active_readers--; -+ client_usage_queue_event(dev->vdev); -+ } -+ MARK(); -+ return 0; -+} -+ -+static ssize_t v4l2_loopback_read(struct file *file, char __user *buf, -+ size_t count, loff_t *ppos) -+{ -+ int read_index; -+ struct v4l2_loopback_device *dev; -+ struct v4l2_buffer *b; -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ -+ read_index = get_capture_buffer(file); -+ if (read_index < 0) -+ return read_index; -+ if (count > dev->buffer_size) -+ count = dev->buffer_size; -+ b = &dev->buffers[read_index].buffer; -+ if (count > b->bytesused) -+ count = b->bytesused; -+ if (copy_to_user((void *)buf, (void *)(dev->image + b->m.offset), -+ count)) { -+ printk(KERN_ERR -+ "v4l2-loopback: failed copy_to_user() in read buf\n"); -+ return -EFAULT; -+ } -+ dprintkrw("leave v4l2_loopback_read()\n"); -+ return count; -+} -+ -+static ssize_t v4l2_loopback_write(struct file *file, const char __user *buf, -+ size_t count, loff_t *ppos) -+{ -+ struct v4l2_loopback_opener *opener; -+ struct v4l2_loopback_device *dev; -+ int write_index; -+ struct v4l2_buffer *b; -+ int err = 0; -+ -+ MARK(); -+ -+ dev = v4l2loopback_getdevice(file); -+ opener = fh_to_opener(file->private_data); -+ -+ if (UNNEGOTIATED == opener->type) { -+ spin_lock(&dev->lock); -+ -+ if (dev->ready_for_output) { -+ err = vidioc_streamon(file, file->private_data, -+ V4L2_BUF_TYPE_VIDEO_OUTPUT); -+ } -+ -+ spin_unlock(&dev->lock); -+ -+ if (err < 0) -+ return err; -+ } -+ -+ if (WRITER != opener->type) -+ return -EINVAL; -+ -+ if (!dev->ready_for_capture) { -+ int ret = allocate_buffers(dev); -+ if (ret < 0) -+ return ret; -+ dev->ready_for_capture = 1; -+ } -+ dprintkrw("v4l2_loopback_write() trying to write %zu bytes\n", count); -+ if (count > dev->buffer_size) -+ count = dev->buffer_size; -+ -+ write_index = dev->write_position % dev->used_buffers; -+ b = &dev->buffers[write_index].buffer; -+ -+ if (copy_from_user((void *)(dev->image + b->m.offset), (void *)buf, -+ count)) { -+ printk(KERN_ERR -+ "v4l2-loopback: failed copy_from_user() in write buf, could not write %zu\n", -+ count); -+ return -EFAULT; -+ } -+ v4l2l_get_timestamp(b); -+ b->bytesused = count; -+ b->sequence = dev->write_position; -+ buffer_written(dev, &dev->buffers[write_index]); -+ wake_up_all(&dev->read_event); -+ dprintkrw("leave v4l2_loopback_write()\n"); -+ return count; -+} -+ -+/* init functions */ -+/* frees buffers, if already allocated */ -+static void free_buffers(struct v4l2_loopback_device *dev) -+{ -+ MARK(); -+ dprintk("freeing image@%p for dev:%p\n", dev ? 
dev->image : NULL, dev); -+ if (!dev) -+ return; -+ if (dev->image) { -+ vfree(dev->image); -+ dev->image = NULL; -+ } -+ if (dev->timeout_image) { -+ vfree(dev->timeout_image); -+ dev->timeout_image = NULL; -+ } -+ dev->imagesize = 0; -+} -+/* frees buffers, if they are no longer needed */ -+static void try_free_buffers(struct v4l2_loopback_device *dev) -+{ -+ MARK(); -+ if (0 == dev->open_count.counter && !dev->keep_format) { -+ free_buffers(dev); -+ dev->ready_for_capture = 0; -+ dev->buffer_size = 0; -+ dev->write_position = 0; -+ } -+} -+/* allocates buffers, if buffer_size is set */ -+static int allocate_buffers(struct v4l2_loopback_device *dev) -+{ -+ int err; -+ -+ MARK(); -+ /* vfree on close file operation in case no open handles left */ -+ -+ if (dev->buffer_size < 1 || dev->buffers_number < 1) -+ return -EINVAL; -+ -+ if ((__LONG_MAX__ / dev->buffer_size) < dev->buffers_number) -+ return -ENOSPC; -+ -+ if (dev->image) { -+ dprintk("allocating buffers again: %ld %ld\n", -+ dev->buffer_size * dev->buffers_number, dev->imagesize); -+ /* FIXME: prevent double allocation more intelligently! */ -+ if (dev->buffer_size * dev->buffers_number == dev->imagesize) -+ return 0; -+ -+ /* if there is only one writer, no problem should occur */ -+ if (dev->open_count.counter == 1) -+ free_buffers(dev); -+ else -+ return -EINVAL; -+ } -+ -+ dev->imagesize = (unsigned long)dev->buffer_size * -+ (unsigned long)dev->buffers_number; -+ -+ dprintk("allocating %ld = %ldx%d\n", dev->imagesize, dev->buffer_size, -+ dev->buffers_number); -+ err = -ENOMEM; -+ -+ if (dev->timeout_jiffies > 0) { -+ err = allocate_timeout_image(dev); -+ if (err < 0) -+ goto error; -+ } -+ -+ dev->image = vmalloc(dev->imagesize); -+ if (dev->image == NULL) -+ goto error; -+ -+ dprintk("vmallocated %ld bytes\n", dev->imagesize); -+ MARK(); -+ -+ init_buffers(dev); -+ return 0; -+ -+error: -+ free_buffers(dev); -+ return err; -+} -+ -+/* init inner buffers, they are capture mode and flags are set as -+ * for capture mod buffers */ -+static void init_buffers(struct v4l2_loopback_device *dev) -+{ -+ int i; -+ int buffer_size; -+ int bytesused; -+ MARK(); -+ -+ buffer_size = dev->buffer_size; -+ bytesused = dev->pix_format.sizeimage; -+ for (i = 0; i < dev->buffers_number; ++i) { -+ struct v4l2_buffer *b = &dev->buffers[i].buffer; -+ b->index = i; -+ b->bytesused = bytesused; -+ b->length = buffer_size; -+ b->field = V4L2_FIELD_NONE; -+ b->flags = 0; -+ b->m.offset = i * buffer_size; -+ b->memory = V4L2_MEMORY_MMAP; -+ b->sequence = 0; -+ b->timestamp.tv_sec = 0; -+ b->timestamp.tv_usec = 0; -+ b->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; -+ -+ v4l2l_get_timestamp(b); -+ } -+ dev->timeout_image_buffer = dev->buffers[0]; -+ dev->timeout_image_buffer.buffer.m.offset = MAX_BUFFERS * buffer_size; -+ MARK(); -+} -+ -+static int allocate_timeout_image(struct v4l2_loopback_device *dev) -+{ -+ MARK(); -+ if (dev->buffer_size <= 0) { -+ dev->timeout_image_io = 0; -+ return -EINVAL; -+ } -+ -+ if (dev->timeout_image == NULL) { -+ dev->timeout_image = vzalloc(dev->buffer_size); -+ if (dev->timeout_image == NULL) { -+ dev->timeout_image_io = 0; -+ return -ENOMEM; -+ } -+ } -+ return 0; -+} -+ -+/* fills and register video device */ -+static void init_vdev(struct video_device *vdev, int nr) -+{ -+ MARK(); -+ -+#ifdef V4L2LOOPBACK_WITH_STD -+ vdev->tvnorms = V4L2_STD_ALL; -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+ vdev->vfl_type = VFL_TYPE_VIDEO; -+ vdev->fops = &v4l2_loopback_fops; -+ vdev->ioctl_ops = &v4l2_loopback_ioctl_ops; -+ vdev->release = 
&video_device_release; -+ vdev->minor = -1; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) -+ vdev->device_caps = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_VIDEO_CAPTURE | -+ V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_READWRITE | -+ V4L2_CAP_STREAMING; -+#endif -+ -+ if (debug > 1) -+ vdev->dev_debug = V4L2_DEV_DEBUG_IOCTL | -+ V4L2_DEV_DEBUG_IOCTL_ARG; -+ -+ vdev->vfl_dir = VFL_DIR_M2M; -+ -+ MARK(); -+} -+ -+/* init default capture parameters, only fps may be changed in future */ -+static void init_capture_param(struct v4l2_captureparm *capture_param) -+{ -+ MARK(); -+ capture_param->capability = 0; -+ capture_param->capturemode = 0; -+ capture_param->extendedmode = 0; -+ capture_param->readbuffers = max_buffers; -+ capture_param->timeperframe.numerator = 1; -+ capture_param->timeperframe.denominator = 30; -+} -+ -+static void check_timers(struct v4l2_loopback_device *dev) -+{ -+ if (!dev->ready_for_capture) -+ return; -+ -+ if (dev->timeout_jiffies > 0 && !timer_pending(&dev->timeout_timer)) -+ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); -+ if (dev->sustain_framerate && !timer_pending(&dev->sustain_timer)) -+ mod_timer(&dev->sustain_timer, -+ jiffies + dev->frame_jiffies * 3 / 2); -+} -+#ifdef HAVE_TIMER_SETUP -+static void sustain_timer_clb(struct timer_list *t) -+{ -+ struct v4l2_loopback_device *dev = from_timer(dev, t, sustain_timer); -+#else -+static void sustain_timer_clb(unsigned long nr) -+{ -+ struct v4l2_loopback_device *dev = -+ idr_find(&v4l2loopback_index_idr, nr); -+#endif -+ spin_lock(&dev->lock); -+ if (dev->sustain_framerate) { -+ dev->reread_count++; -+ dprintkrw("reread: %lld %d\n", (long long)dev->write_position, -+ dev->reread_count); -+ if (dev->reread_count == 1) -+ mod_timer(&dev->sustain_timer, -+ jiffies + max(1UL, dev->frame_jiffies / 2)); -+ else -+ mod_timer(&dev->sustain_timer, -+ jiffies + dev->frame_jiffies); -+ wake_up_all(&dev->read_event); -+ } -+ spin_unlock(&dev->lock); -+} -+#ifdef HAVE_TIMER_SETUP -+static void timeout_timer_clb(struct timer_list *t) -+{ -+ struct v4l2_loopback_device *dev = from_timer(dev, t, timeout_timer); -+#else -+static void timeout_timer_clb(unsigned long nr) -+{ -+ struct v4l2_loopback_device *dev = -+ idr_find(&v4l2loopback_index_idr, nr); -+#endif -+ spin_lock(&dev->lock); -+ if (dev->timeout_jiffies > 0) { -+ dev->timeout_happened = 1; -+ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); -+ wake_up_all(&dev->read_event); -+ } -+ spin_unlock(&dev->lock); -+} -+ -+/* init loopback main structure */ -+#define DEFAULT_FROM_CONF(confmember, default_condition, default_value) \ -+ ((conf) ? \ -+ ((conf->confmember default_condition) ? (default_value) : \ -+ (conf->confmember)) : \ -+ default_value) -+ -+static int v4l2_loopback_add(struct v4l2_loopback_config *conf, int *ret_nr) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_ctrl_handler *hdl; -+ struct v4l2loopback_private *vdev_priv = NULL; -+ -+ int err = -ENOMEM; -+ -+ u32 _width = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; -+ u32 _height = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; -+ -+ u32 _min_width = DEFAULT_FROM_CONF(min_width, -+ < V4L2LOOPBACK_SIZE_MIN_WIDTH, -+ V4L2LOOPBACK_SIZE_MIN_WIDTH); -+ u32 _min_height = DEFAULT_FROM_CONF(min_height, -+ < V4L2LOOPBACK_SIZE_MIN_HEIGHT, -+ V4L2LOOPBACK_SIZE_MIN_HEIGHT); -+ u32 _max_width = DEFAULT_FROM_CONF(max_width, < _min_width, max_width); -+ u32 _max_height = -+ DEFAULT_FROM_CONF(max_height, < _min_height, max_height); -+ bool _announce_all_caps = (conf && conf->announce_all_caps >= 0) ? 
-+ (conf->announce_all_caps) : -+ V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS; -+ int _max_buffers = DEFAULT_FROM_CONF(max_buffers, <= 0, max_buffers); -+ int _max_openers = DEFAULT_FROM_CONF(max_openers, <= 0, max_openers); -+ -+ int nr = -1; -+ -+ _announce_all_caps = (!!_announce_all_caps); -+ -+ if (conf) { -+ const int output_nr = conf->output_nr; -+#ifdef SPLIT_DEVICES -+ const int capture_nr = conf->capture_nr; -+#else -+ const int capture_nr = output_nr; -+#endif -+ if (capture_nr >= 0 && output_nr == capture_nr) { -+ nr = output_nr; -+ } else if (capture_nr < 0 && output_nr < 0) { -+ nr = -1; -+ } else if (capture_nr < 0) { -+ nr = output_nr; -+ } else if (output_nr < 0) { -+ nr = capture_nr; -+ } else { -+ printk(KERN_ERR -+ "split OUTPUT and CAPTURE devices not yet supported."); -+ printk(KERN_INFO -+ "both devices must have the same number (%d != %d).", -+ output_nr, capture_nr); -+ return -EINVAL; -+ } -+ } -+ -+ if (idr_find(&v4l2loopback_index_idr, nr)) -+ return -EEXIST; -+ -+ dprintk("creating v4l2loopback-device #%d\n", nr); -+ dev = kzalloc(sizeof(*dev), GFP_KERNEL); -+ if (!dev) -+ return -ENOMEM; -+ -+ /* allocate id, if @id >= 0, we're requesting that specific id */ -+ if (nr >= 0) { -+ err = idr_alloc(&v4l2loopback_index_idr, dev, nr, nr + 1, -+ GFP_KERNEL); -+ if (err == -ENOSPC) -+ err = -EEXIST; -+ } else { -+ err = idr_alloc(&v4l2loopback_index_idr, dev, 0, 0, GFP_KERNEL); -+ } -+ if (err < 0) -+ goto out_free_dev; -+ nr = err; -+ err = -ENOMEM; -+ -+ if (conf && conf->card_label[0]) { -+ snprintf(dev->card_label, sizeof(dev->card_label), "%s", -+ conf->card_label); -+ } else { -+ snprintf(dev->card_label, sizeof(dev->card_label), -+ "Dummy video device (0x%04X)", nr); -+ } -+ snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), -+ "v4l2loopback-%03d", nr); -+ -+ err = v4l2_device_register(NULL, &dev->v4l2_dev); -+ if (err) -+ goto out_free_idr; -+ MARK(); -+ -+ dev->vdev = video_device_alloc(); -+ if (dev->vdev == NULL) { -+ err = -ENOMEM; -+ goto out_unregister; -+ } -+ -+ vdev_priv = kzalloc(sizeof(struct v4l2loopback_private), GFP_KERNEL); -+ if (vdev_priv == NULL) { -+ err = -ENOMEM; -+ goto out_unregister; -+ } -+ -+ video_set_drvdata(dev->vdev, vdev_priv); -+ if (video_get_drvdata(dev->vdev) == NULL) { -+ err = -ENOMEM; -+ goto out_unregister; -+ } -+ -+ MARK(); -+ snprintf(dev->vdev->name, sizeof(dev->vdev->name), "%s", -+ dev->card_label); -+ -+ vdev_priv->device_nr = nr; -+ -+ init_vdev(dev->vdev, nr); -+ dev->vdev->v4l2_dev = &dev->v4l2_dev; -+ init_capture_param(&dev->capture_param); -+ err = set_timeperframe(dev, &dev->capture_param.timeperframe); -+ if (err) -+ goto out_unregister; -+ dev->keep_format = 0; -+ dev->sustain_framerate = 0; -+ -+ dev->announce_all_caps = _announce_all_caps; -+ dev->min_width = _min_width; -+ dev->min_height = _min_height; -+ dev->max_width = _max_width; -+ dev->max_height = _max_height; -+ dev->max_openers = _max_openers; -+ dev->buffers_number = dev->used_buffers = _max_buffers; -+ -+ dev->write_position = 0; -+ -+ MARK(); -+ spin_lock_init(&dev->lock); -+ spin_lock_init(&dev->list_lock); -+ INIT_LIST_HEAD(&dev->outbufs_list); -+ if (list_empty(&dev->outbufs_list)) { -+ int i; -+ -+ for (i = 0; i < dev->used_buffers; ++i) -+ list_add_tail(&dev->buffers[i].list_head, -+ &dev->outbufs_list); -+ } -+ memset(dev->bufpos2index, 0, sizeof(dev->bufpos2index)); -+ atomic_set(&dev->open_count, 0); -+ dev->ready_for_capture = 0; -+ dev->ready_for_output = 1; -+ -+ dev->buffer_size = 0; -+ dev->image = NULL; -+ dev->imagesize = 0; 
-+#ifdef HAVE_TIMER_SETUP -+ timer_setup(&dev->sustain_timer, sustain_timer_clb, 0); -+ timer_setup(&dev->timeout_timer, timeout_timer_clb, 0); -+#else -+ setup_timer(&dev->sustain_timer, sustain_timer_clb, nr); -+ setup_timer(&dev->timeout_timer, timeout_timer_clb, nr); -+#endif -+ dev->reread_count = 0; -+ dev->timeout_jiffies = 0; -+ dev->timeout_image = NULL; -+ dev->timeout_happened = 0; -+ -+ hdl = &dev->ctrl_handler; -+ err = v4l2_ctrl_handler_init(hdl, 4); -+ if (err) -+ goto out_unregister; -+ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_keepformat, NULL); -+ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_sustainframerate, NULL); -+ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeout, NULL); -+ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeoutimageio, NULL); -+ if (hdl->error) { -+ err = hdl->error; -+ goto out_free_handler; -+ } -+ dev->v4l2_dev.ctrl_handler = hdl; -+ -+ err = v4l2_ctrl_handler_setup(hdl); -+ if (err) -+ goto out_free_handler; -+ -+ /* FIXME set buffers to 0 */ -+ -+ /* Set initial format */ -+ if (_width < _min_width) -+ _width = _min_width; -+ if (_width > _max_width) -+ _width = _max_width; -+ if (_height < _min_height) -+ _height = _min_height; -+ if (_height > _max_height) -+ _height = _max_height; -+ -+ dev->pix_format.width = _width; -+ dev->pix_format.height = _height; -+ dev->pix_format.pixelformat = formats[0].fourcc; -+ dev->pix_format.colorspace = -+ V4L2_COLORSPACE_DEFAULT; /* do we need to set this ? */ -+ dev->pix_format.field = V4L2_FIELD_NONE; -+ -+ dev->buffer_size = PAGE_ALIGN(dev->pix_format.sizeimage); -+ dprintk("buffer_size = %ld (=%d)\n", dev->buffer_size, -+ dev->pix_format.sizeimage); -+ -+ if (dev->buffer_size && ((err = allocate_buffers(dev)) < 0)) -+ goto out_free_handler; -+ -+ init_waitqueue_head(&dev->read_event); -+ -+ /* register the device -> it creates /dev/video* */ -+ if (video_register_device(dev->vdev, VFL_TYPE_VIDEO, nr) < 0) { -+ printk(KERN_ERR -+ "v4l2loopback: failed video_register_device()\n"); -+ err = -EFAULT; -+ goto out_free_device; -+ } -+ v4l2loopback_create_sysfs(dev->vdev); -+ -+ MARK(); -+ if (ret_nr) -+ *ret_nr = dev->vdev->num; -+ return 0; -+ -+out_free_device: -+ video_device_release(dev->vdev); -+out_free_handler: -+ v4l2_ctrl_handler_free(&dev->ctrl_handler); -+out_unregister: -+ video_set_drvdata(dev->vdev, NULL); -+ if (vdev_priv != NULL) -+ kfree(vdev_priv); -+ v4l2_device_unregister(&dev->v4l2_dev); -+out_free_idr: -+ idr_remove(&v4l2loopback_index_idr, nr); -+out_free_dev: -+ kfree(dev); -+ return err; -+} -+ -+static void v4l2_loopback_remove(struct v4l2_loopback_device *dev) -+{ -+ free_buffers(dev); -+ v4l2loopback_remove_sysfs(dev->vdev); -+ kfree(video_get_drvdata(dev->vdev)); -+ video_unregister_device(dev->vdev); -+ v4l2_device_unregister(&dev->v4l2_dev); -+ v4l2_ctrl_handler_free(&dev->ctrl_handler); -+ kfree(dev); -+} -+ -+static long v4l2loopback_control_ioctl(struct file *file, unsigned int cmd, -+ unsigned long parm) -+{ -+ struct v4l2_loopback_device *dev; -+ struct v4l2_loopback_config conf; -+ struct v4l2_loopback_config *confptr = &conf; -+ int device_nr, capture_nr, output_nr; -+ int ret; -+ -+ ret = mutex_lock_killable(&v4l2loopback_ctl_mutex); -+ if (ret) -+ return ret; -+ -+ ret = -EINVAL; -+ switch (cmd) { -+ default: -+ ret = -ENOSYS; -+ break; -+ /* add a v4l2loopback device (pair), based on the user-provided specs */ -+ case V4L2LOOPBACK_CTL_ADD: -+ if (parm) { -+ if ((ret = copy_from_user(&conf, (void *)parm, -+ sizeof(conf))) < 0) -+ break; -+ } else -+ confptr = NULL; 
-+ ret = v4l2_loopback_add(confptr, &device_nr); -+ if (ret >= 0) -+ ret = device_nr; -+ break; -+ /* remove a v4l2loopback device (both capture and output) */ -+ case V4L2LOOPBACK_CTL_REMOVE: -+ ret = v4l2loopback_lookup((int)parm, &dev); -+ if (ret >= 0 && dev) { -+ int nr = ret; -+ ret = -EBUSY; -+ if (dev->open_count.counter > 0) -+ break; -+ idr_remove(&v4l2loopback_index_idr, nr); -+ v4l2_loopback_remove(dev); -+ ret = 0; -+ }; -+ break; -+ /* get information for a loopback device. -+ * this is mostly about limits (which cannot be queried directly with VIDIOC_G_FMT and friends -+ */ -+ case V4L2LOOPBACK_CTL_QUERY: -+ if (!parm) -+ break; -+ if ((ret = copy_from_user(&conf, (void *)parm, sizeof(conf))) < -+ 0) -+ break; -+ capture_nr = output_nr = conf.output_nr; -+#ifdef SPLIT_DEVICES -+ capture_nr = conf.capture_nr; -+#endif -+ device_nr = (output_nr < 0) ? capture_nr : output_nr; -+ MARK(); -+ /* get the device from either capture_nr or output_nr (whatever is valid) */ -+ if ((ret = v4l2loopback_lookup(device_nr, &dev)) < 0) -+ break; -+ MARK(); -+ /* if we got the device from output_nr and there is a valid capture_nr, -+ * make sure that both refer to the same device (or bail out) -+ */ -+ if ((device_nr != capture_nr) && (capture_nr >= 0) && -+ ((ret = v4l2loopback_lookup(capture_nr, 0)) < 0)) -+ break; -+ MARK(); -+ /* if otoh, we got the device from capture_nr and there is a valid output_nr, -+ * make sure that both refer to the same device (or bail out) -+ */ -+ if ((device_nr != output_nr) && (output_nr >= 0) && -+ ((ret = v4l2loopback_lookup(output_nr, 0)) < 0)) -+ break; -+ MARK(); -+ -+ /* v4l2_loopback_config identified a single device, so fetch the data */ -+ snprintf(conf.card_label, sizeof(conf.card_label), "%s", -+ dev->card_label); -+ MARK(); -+ conf.output_nr = dev->vdev->num; -+#ifdef SPLIT_DEVICES -+ conf.capture_nr = dev->vdev->num; -+#endif -+ conf.min_width = dev->min_width; -+ conf.min_height = dev->min_height; -+ conf.max_width = dev->max_width; -+ conf.max_height = dev->max_height; -+ conf.announce_all_caps = dev->announce_all_caps; -+ conf.max_buffers = dev->buffers_number; -+ conf.max_openers = dev->max_openers; -+ conf.debug = debug; -+ MARK(); -+ if (copy_to_user((void *)parm, &conf, sizeof(conf))) { -+ ret = -EFAULT; -+ break; -+ } -+ MARK(); -+ ret = 0; -+ ; -+ break; -+ } -+ -+ MARK(); -+ mutex_unlock(&v4l2loopback_ctl_mutex); -+ MARK(); -+ return ret; -+} -+ -+/* LINUX KERNEL */ -+ -+static const struct file_operations v4l2loopback_ctl_fops = { -+ // clang-format off -+ .owner = THIS_MODULE, -+ .open = nonseekable_open, -+ .unlocked_ioctl = v4l2loopback_control_ioctl, -+ .compat_ioctl = v4l2loopback_control_ioctl, -+ .llseek = noop_llseek, -+ // clang-format on -+}; -+ -+static struct miscdevice v4l2loopback_misc = { -+ // clang-format off -+ .minor = MISC_DYNAMIC_MINOR, -+ .name = "v4l2loopback", -+ .fops = &v4l2loopback_ctl_fops, -+ // clang-format on -+}; -+ -+static const struct v4l2_file_operations v4l2_loopback_fops = { -+ // clang-format off -+ .owner = THIS_MODULE, -+ .open = v4l2_loopback_open, -+ .release = v4l2_loopback_close, -+ .read = v4l2_loopback_read, -+ .write = v4l2_loopback_write, -+ .poll = v4l2_loopback_poll, -+ .mmap = v4l2_loopback_mmap, -+ .unlocked_ioctl = video_ioctl2, -+ // clang-format on -+}; -+ -+static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops = { -+ // clang-format off -+ .vidioc_querycap = &vidioc_querycap, -+ .vidioc_enum_framesizes = &vidioc_enum_framesizes, -+ .vidioc_enum_frameintervals = 
&vidioc_enum_frameintervals, -+ -+ .vidioc_enum_output = &vidioc_enum_output, -+ .vidioc_g_output = &vidioc_g_output, -+ .vidioc_s_output = &vidioc_s_output, -+ -+ .vidioc_enum_input = &vidioc_enum_input, -+ .vidioc_g_input = &vidioc_g_input, -+ .vidioc_s_input = &vidioc_s_input, -+ -+ .vidioc_enum_fmt_vid_cap = &vidioc_enum_fmt_cap, -+ .vidioc_g_fmt_vid_cap = &vidioc_g_fmt_cap, -+ .vidioc_s_fmt_vid_cap = &vidioc_s_fmt_cap, -+ .vidioc_try_fmt_vid_cap = &vidioc_try_fmt_cap, -+ -+ .vidioc_enum_fmt_vid_out = &vidioc_enum_fmt_out, -+ .vidioc_s_fmt_vid_out = &vidioc_s_fmt_out, -+ .vidioc_g_fmt_vid_out = &vidioc_g_fmt_out, -+ .vidioc_try_fmt_vid_out = &vidioc_try_fmt_out, -+ -+#ifdef V4L2L_OVERLAY -+ .vidioc_s_fmt_vid_overlay = &vidioc_s_fmt_overlay, -+ .vidioc_g_fmt_vid_overlay = &vidioc_g_fmt_overlay, -+#endif -+ -+#ifdef V4L2LOOPBACK_WITH_STD -+ .vidioc_s_std = &vidioc_s_std, -+ .vidioc_g_std = &vidioc_g_std, -+ .vidioc_querystd = &vidioc_querystd, -+#endif /* V4L2LOOPBACK_WITH_STD */ -+ -+ .vidioc_g_parm = &vidioc_g_parm, -+ .vidioc_s_parm = &vidioc_s_parm, -+ -+ .vidioc_reqbufs = &vidioc_reqbufs, -+ .vidioc_querybuf = &vidioc_querybuf, -+ .vidioc_qbuf = &vidioc_qbuf, -+ .vidioc_dqbuf = &vidioc_dqbuf, -+ -+ .vidioc_streamon = &vidioc_streamon, -+ .vidioc_streamoff = &vidioc_streamoff, -+ -+#ifdef CONFIG_VIDEO_V4L1_COMPAT -+ .vidiocgmbuf = &vidiocgmbuf, -+#endif -+ -+ .vidioc_subscribe_event = &vidioc_subscribe_event, -+ .vidioc_unsubscribe_event = &v4l2_event_unsubscribe, -+ // clang-format on -+}; -+ -+static int free_device_cb(int id, void *ptr, void *data) -+{ -+ struct v4l2_loopback_device *dev = ptr; -+ v4l2_loopback_remove(dev); -+ return 0; -+} -+static void free_devices(void) -+{ -+ idr_for_each(&v4l2loopback_index_idr, &free_device_cb, NULL); -+ idr_destroy(&v4l2loopback_index_idr); -+} -+ -+static int __init v4l2loopback_init_module(void) -+{ -+ const u32 min_width = V4L2LOOPBACK_SIZE_MIN_WIDTH; -+ const u32 min_height = V4L2LOOPBACK_SIZE_MIN_HEIGHT; -+ int err; -+ int i; -+ MARK(); -+ -+ err = misc_register(&v4l2loopback_misc); -+ if (err < 0) -+ return err; -+ -+ if (devices < 0) { -+ devices = 1; -+ -+ /* try guessing the devices from the "video_nr" parameter */ -+ for (i = MAX_DEVICES - 1; i >= 0; i--) { -+ if (video_nr[i] >= 0) { -+ devices = i + 1; -+ break; -+ } -+ } -+ } -+ -+ if (devices > MAX_DEVICES) { -+ devices = MAX_DEVICES; -+ printk(KERN_INFO -+ "v4l2loopback: number of initial devices is limited to: %d\n", -+ MAX_DEVICES); -+ } -+ -+ if (max_buffers > MAX_BUFFERS) { -+ max_buffers = MAX_BUFFERS; -+ printk(KERN_INFO -+ "v4l2loopback: number of buffers is limited to: %d\n", -+ MAX_BUFFERS); -+ } -+ -+ if (max_openers < 0) { -+ printk(KERN_INFO -+ "v4l2loopback: allowing %d openers rather than %d\n", -+ 2, max_openers); -+ max_openers = 2; -+ } -+ -+ if (max_width < min_width) { -+ max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; -+ printk(KERN_INFO "v4l2loopback: using max_width %d\n", -+ max_width); -+ } -+ if (max_height < min_height) { -+ max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; -+ printk(KERN_INFO "v4l2loopback: using max_height %d\n", -+ max_height); -+ } -+ -+ for (i = 0; i < devices; i++) { -+ struct v4l2_loopback_config cfg = { -+ // clang-format off -+ .output_nr = video_nr[i], -+#ifdef SPLIT_DEVICES -+ .capture_nr = video_nr[i], -+#endif -+ .min_width = min_width, -+ .min_height = min_height, -+ .max_width = max_width, -+ .max_height = max_height, -+ .announce_all_caps = (!exclusive_caps[i]), -+ .max_buffers = max_buffers, -+ .max_openers = 
max_openers, -+ .debug = debug, -+ // clang-format on -+ }; -+ cfg.card_label[0] = 0; -+ if (card_label[i]) -+ snprintf(cfg.card_label, sizeof(cfg.card_label), "%s", -+ card_label[i]); -+ err = v4l2_loopback_add(&cfg, 0); -+ if (err) { -+ free_devices(); -+ goto error; -+ } -+ } -+ -+ dprintk("module installed\n"); -+ -+ printk(KERN_INFO "v4l2loopback driver version %d.%d.%d%s loaded\n", -+ // clang-format off -+ (V4L2LOOPBACK_VERSION_CODE >> 16) & 0xff, -+ (V4L2LOOPBACK_VERSION_CODE >> 8) & 0xff, -+ (V4L2LOOPBACK_VERSION_CODE ) & 0xff, -+#ifdef SNAPSHOT_VERSION -+ " (" __stringify(SNAPSHOT_VERSION) ")" -+#else -+ "" -+#endif -+ ); -+ // clang-format on -+ -+ return 0; -+error: -+ misc_deregister(&v4l2loopback_misc); -+ return err; -+} -+ -+static void v4l2loopback_cleanup_module(void) -+{ -+ MARK(); -+ /* unregister the device -> it deletes /dev/video* */ -+ free_devices(); -+ /* and get rid of /dev/v4l2loopback */ -+ misc_deregister(&v4l2loopback_misc); -+ dprintk("module removed\n"); -+} -+ -+MODULE_ALIAS_MISCDEV(MISC_DYNAMIC_MINOR); -+ -+module_init(v4l2loopback_init_module); -+module_exit(v4l2loopback_cleanup_module); -diff --git a/drivers/media/v4l2-core/v4l2loopback.h b/drivers/media/v4l2-core/v4l2loopback.h -new file mode 100644 -index 000000000000..18f2f376e7ec ---- /dev/null -+++ b/drivers/media/v4l2-core/v4l2loopback.h -@@ -0,0 +1,98 @@ -+/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ -+/* -+ * v4l2loopback.h -+ * -+ * Written by IOhannes m zmölnig, 7/1/20. -+ * -+ * Copyright 2020 by IOhannes m zmölnig. Redistribution of this file is -+ * permitted under the GNU General Public License. -+ */ -+#ifndef _V4L2LOOPBACK_H -+#define _V4L2LOOPBACK_H -+ -+#define V4L2LOOPBACK_VERSION_MAJOR 0 -+#define V4L2LOOPBACK_VERSION_MINOR 12 -+#define V4L2LOOPBACK_VERSION_BUGFIX 7 -+ -+/* /dev/v4l2loopback interface */ -+ -+struct v4l2_loopback_config { -+ /** -+ * the device-number (/dev/video) -+ * V4L2LOOPBACK_CTL_ADD: -+ * setting this to a value<0, will allocate an available one -+ * if nr>=0 and the device already exists, the ioctl will EEXIST -+ * if output_nr and capture_nr are the same, only a single device will be created -+ * NOTE: currently split-devices (where output_nr and capture_nr differ) -+ * are not implemented yet. -+ * until then, requesting different device-IDs will result in EINVAL. 
-+ * -+ * V4L2LOOPBACK_CTL_QUERY: -+ * either both output_nr and capture_nr must refer to the same loopback, -+ * or one (and only one) of them must be -1 -+ * -+ */ -+ int output_nr; -+ int unused; /*capture_nr;*/ -+ -+ /** -+ * a nice name for your device -+ * if (*card_label)==0, an automatic name is assigned -+ */ -+ char card_label[32]; -+ -+ /** -+ * allowed frame size -+ * if too low, default values are used -+ */ -+ unsigned int min_width; -+ unsigned int max_width; -+ unsigned int min_height; -+ unsigned int max_height; -+ -+ /** -+ * number of buffers to allocate for the queue -+ * if set to <=0, default values are used -+ */ -+ int max_buffers; -+ -+ /** -+ * how many consumers are allowed to open this device concurrently -+ * if set to <=0, default values are used -+ */ -+ int max_openers; -+ -+ /** -+ * set the debugging level for this device -+ */ -+ int debug; -+ -+ /** -+ * whether to announce OUTPUT/CAPTURE capabilities exclusively -+ * for this device or not -+ * (!exclusive_caps) -+ * NOTE: this is going to be removed once separate output/capture -+ * devices are implemented -+ */ -+ int announce_all_caps; -+}; -+ -+/* a pointer to a (struct v4l2_loopback_config) that has all values you wish to impose on the -+ * to-be-created device set. -+ * if the ptr is NULL, a new device is created with default values at the driver's discretion. -+ * -+ * returns the device_nr of the OUTPUT device (which can be used with V4L2LOOPBACK_CTL_QUERY, -+ * to get more information on the device) -+ */ -+#define V4L2LOOPBACK_CTL_ADD 0x4C80 -+ -+/* a pointer to a (struct v4l2_loopback_config) that has output_nr and/or capture_nr set -+ * (the two values must either refer to video-devices associated with the same loopback device -+ * or exactly one of them must be <0 -+ */ -+#define V4L2LOOPBACK_CTL_QUERY 0x4C82 -+ -+/* the device-number (either CAPTURE or OUTPUT) associated with the loopback-device */ -+#define V4L2LOOPBACK_CTL_REMOVE 0x4C81 -+ -+#endif /* _V4L2LOOPBACK_H */ -diff --git a/drivers/media/v4l2-core/v4l2loopback_formats.h b/drivers/media/v4l2-core/v4l2loopback_formats.h -new file mode 100644 -index 000000000000..d855a3796554 ---- /dev/null -+++ b/drivers/media/v4l2-core/v4l2loopback_formats.h -@@ -0,0 +1,445 @@ -+static const struct v4l2l_format formats[] = { -+#ifndef V4L2_PIX_FMT_VP9 -+#define V4L2_PIX_FMT_VP9 v4l2_fourcc('V', 'P', '9', '0') -+#endif -+#ifndef V4L2_PIX_FMT_HEVC -+#define V4L2_PIX_FMT_HEVC v4l2_fourcc('H', 'E', 'V', 'C') -+#endif -+ -+ /* here come the packed formats */ -+ { -+ .name = "32 bpp RGB, le", -+ .fourcc = V4L2_PIX_FMT_BGR32, -+ .depth = 32, -+ .flags = 0, -+ }, -+ { -+ .name = "32 bpp RGB, be", -+ .fourcc = V4L2_PIX_FMT_RGB32, -+ .depth = 32, -+ .flags = 0, -+ }, -+ { -+ .name = "24 bpp RGB, le", -+ .fourcc = V4L2_PIX_FMT_BGR24, -+ .depth = 24, -+ .flags = 0, -+ }, -+ { -+ .name = "24 bpp RGB, be", -+ .fourcc = V4L2_PIX_FMT_RGB24, -+ .depth = 24, -+ .flags = 0, -+ }, -+#ifdef V4L2_PIX_FMT_ABGR32 -+ { -+ .name = "32 bpp RGBA, le", -+ .fourcc = V4L2_PIX_FMT_ABGR32, -+ .depth = 32, -+ .flags = 0, -+ }, -+#endif -+#ifdef V4L2_PIX_FMT_RGBA32 -+ { -+ .name = "32 bpp RGBA", -+ .fourcc = V4L2_PIX_FMT_RGBA32, -+ .depth = 32, -+ .flags = 0, -+ }, -+#endif -+#ifdef V4L2_PIX_FMT_RGB332 -+ { -+ .name = "8 bpp RGB-3-3-2", -+ .fourcc = V4L2_PIX_FMT_RGB332, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB332 */ -+#ifdef V4L2_PIX_FMT_RGB444 -+ { -+ .name = "16 bpp RGB (xxxxrrrr ggggbbbb)", -+ .fourcc = V4L2_PIX_FMT_RGB444, -+ .depth = 16, -+ .flags = 
0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB444 */ -+#ifdef V4L2_PIX_FMT_RGB555 -+ { -+ .name = "16 bpp RGB-5-5-5", -+ .fourcc = V4L2_PIX_FMT_RGB555, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB555 */ -+#ifdef V4L2_PIX_FMT_RGB565 -+ { -+ .name = "16 bpp RGB-5-6-5", -+ .fourcc = V4L2_PIX_FMT_RGB565, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB565 */ -+#ifdef V4L2_PIX_FMT_RGB555X -+ { -+ .name = "16 bpp RGB-5-5-5 BE", -+ .fourcc = V4L2_PIX_FMT_RGB555X, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB555X */ -+#ifdef V4L2_PIX_FMT_RGB565X -+ { -+ .name = "16 bpp RGB-5-6-5 BE", -+ .fourcc = V4L2_PIX_FMT_RGB565X, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_RGB565X */ -+#ifdef V4L2_PIX_FMT_BGR666 -+ { -+ .name = "18 bpp BGR-6-6-6", -+ .fourcc = V4L2_PIX_FMT_BGR666, -+ .depth = 18, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_BGR666 */ -+ { -+ .name = "4:2:2, packed, YUYV", -+ .fourcc = V4L2_PIX_FMT_YUYV, -+ .depth = 16, -+ .flags = 0, -+ }, -+ { -+ .name = "4:2:2, packed, UYVY", -+ .fourcc = V4L2_PIX_FMT_UYVY, -+ .depth = 16, -+ .flags = 0, -+ }, -+#ifdef V4L2_PIX_FMT_YVYU -+ { -+ .name = "4:2:2, packed YVYU", -+ .fourcc = V4L2_PIX_FMT_YVYU, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif -+#ifdef V4L2_PIX_FMT_VYUY -+ { -+ .name = "4:2:2, packed VYUY", -+ .fourcc = V4L2_PIX_FMT_VYUY, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif -+ { -+ .name = "4:2:2, packed YYUV", -+ .fourcc = V4L2_PIX_FMT_YYUV, -+ .depth = 16, -+ .flags = 0, -+ }, -+ { -+ .name = "YUV-8-8-8-8", -+ .fourcc = V4L2_PIX_FMT_YUV32, -+ .depth = 32, -+ .flags = 0, -+ }, -+ { -+ .name = "8 bpp, Greyscale", -+ .fourcc = V4L2_PIX_FMT_GREY, -+ .depth = 8, -+ .flags = 0, -+ }, -+#ifdef V4L2_PIX_FMT_Y4 -+ { -+ .name = "4 bpp Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y4, -+ .depth = 4, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_Y4 */ -+#ifdef V4L2_PIX_FMT_Y6 -+ { -+ .name = "6 bpp Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y6, -+ .depth = 6, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_Y6 */ -+#ifdef V4L2_PIX_FMT_Y10 -+ { -+ .name = "10 bpp Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y10, -+ .depth = 10, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_Y10 */ -+#ifdef V4L2_PIX_FMT_Y12 -+ { -+ .name = "12 bpp Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y12, -+ .depth = 12, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_Y12 */ -+ { -+ .name = "16 bpp, Greyscale", -+ .fourcc = V4L2_PIX_FMT_Y16, -+ .depth = 16, -+ .flags = 0, -+ }, -+#ifdef V4L2_PIX_FMT_YUV444 -+ { -+ .name = "16 bpp xxxxyyyy uuuuvvvv", -+ .fourcc = V4L2_PIX_FMT_YUV444, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_YUV444 */ -+#ifdef V4L2_PIX_FMT_YUV555 -+ { -+ .name = "16 bpp YUV-5-5-5", -+ .fourcc = V4L2_PIX_FMT_YUV555, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_YUV555 */ -+#ifdef V4L2_PIX_FMT_YUV565 -+ { -+ .name = "16 bpp YUV-5-6-5", -+ .fourcc = V4L2_PIX_FMT_YUV565, -+ .depth = 16, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_YUV565 */ -+ -+/* bayer formats */ -+#ifdef V4L2_PIX_FMT_SRGGB8 -+ { -+ .name = "Bayer RGGB 8bit", -+ .fourcc = V4L2_PIX_FMT_SRGGB8, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_SRGGB8 */ -+#ifdef V4L2_PIX_FMT_SGRBG8 -+ { -+ .name = "Bayer GRBG 8bit", -+ .fourcc = V4L2_PIX_FMT_SGRBG8, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_SGRBG8 */ -+#ifdef V4L2_PIX_FMT_SGBRG8 -+ { -+ .name = "Bayer GBRG 8bit", -+ .fourcc = V4L2_PIX_FMT_SGBRG8, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_SGBRG8 */ -+#ifdef V4L2_PIX_FMT_SBGGR8 -+ { 
-+ .name = "Bayer BA81 8bit", -+ .fourcc = V4L2_PIX_FMT_SBGGR8, -+ .depth = 8, -+ .flags = 0, -+ }, -+#endif /* V4L2_PIX_FMT_SBGGR8 */ -+ -+ /* here come the planar formats */ -+ { -+ .name = "4:1:0, planar, Y-Cr-Cb", -+ .fourcc = V4L2_PIX_FMT_YVU410, -+ .depth = 9, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+ { -+ .name = "4:2:0, planar, Y-Cr-Cb", -+ .fourcc = V4L2_PIX_FMT_YVU420, -+ .depth = 12, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+ { -+ .name = "4:1:0, planar, Y-Cb-Cr", -+ .fourcc = V4L2_PIX_FMT_YUV410, -+ .depth = 9, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+ { -+ .name = "4:2:0, planar, Y-Cb-Cr", -+ .fourcc = V4L2_PIX_FMT_YUV420, -+ .depth = 12, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#ifdef V4L2_PIX_FMT_YUV422P -+ { -+ .name = "16 bpp YVU422 planar", -+ .fourcc = V4L2_PIX_FMT_YUV422P, -+ .depth = 16, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#endif /* V4L2_PIX_FMT_YUV422P */ -+#ifdef V4L2_PIX_FMT_YUV411P -+ { -+ .name = "16 bpp YVU411 planar", -+ .fourcc = V4L2_PIX_FMT_YUV411P, -+ .depth = 16, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#endif /* V4L2_PIX_FMT_YUV411P */ -+#ifdef V4L2_PIX_FMT_Y41P -+ { -+ .name = "12 bpp YUV 4:1:1", -+ .fourcc = V4L2_PIX_FMT_Y41P, -+ .depth = 12, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#endif /* V4L2_PIX_FMT_Y41P */ -+#ifdef V4L2_PIX_FMT_NV12 -+ { -+ .name = "12 bpp Y/CbCr 4:2:0 ", -+ .fourcc = V4L2_PIX_FMT_NV12, -+ .depth = 12, -+ .flags = FORMAT_FLAGS_PLANAR, -+ }, -+#endif /* V4L2_PIX_FMT_NV12 */ -+ -+/* here come the compressed formats */ -+ -+#ifdef V4L2_PIX_FMT_MJPEG -+ { -+ .name = "Motion-JPEG", -+ .fourcc = V4L2_PIX_FMT_MJPEG, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_MJPEG */ -+#ifdef V4L2_PIX_FMT_JPEG -+ { -+ .name = "JFIF JPEG", -+ .fourcc = V4L2_PIX_FMT_JPEG, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_JPEG */ -+#ifdef V4L2_PIX_FMT_DV -+ { -+ .name = "DV1394", -+ .fourcc = V4L2_PIX_FMT_DV, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_DV */ -+#ifdef V4L2_PIX_FMT_MPEG -+ { -+ .name = "MPEG-1/2/4 Multiplexed", -+ .fourcc = V4L2_PIX_FMT_MPEG, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_MPEG */ -+#ifdef V4L2_PIX_FMT_H264 -+ { -+ .name = "H264 with start codes", -+ .fourcc = V4L2_PIX_FMT_H264, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_H264 */ -+#ifdef V4L2_PIX_FMT_H264_NO_SC -+ { -+ .name = "H264 without start codes", -+ .fourcc = V4L2_PIX_FMT_H264_NO_SC, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_H264_NO_SC */ -+#ifdef V4L2_PIX_FMT_H264_MVC -+ { -+ .name = "H264 MVC", -+ .fourcc = V4L2_PIX_FMT_H264_MVC, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_H264_MVC */ -+#ifdef V4L2_PIX_FMT_H263 -+ { -+ .name = "H263", -+ .fourcc = V4L2_PIX_FMT_H263, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_H263 */ -+#ifdef V4L2_PIX_FMT_MPEG1 -+ { -+ .name = "MPEG-1 ES", -+ .fourcc = V4L2_PIX_FMT_MPEG1, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_MPEG1 */ -+#ifdef V4L2_PIX_FMT_MPEG2 -+ { -+ .name = "MPEG-2 ES", -+ .fourcc = V4L2_PIX_FMT_MPEG2, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_MPEG2 */ -+#ifdef V4L2_PIX_FMT_MPEG4 -+ { -+ .name = "MPEG-4 part 2 ES", -+ .fourcc = V4L2_PIX_FMT_MPEG4, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* 
V4L2_PIX_FMT_MPEG4 */ -+#ifdef V4L2_PIX_FMT_XVID -+ { -+ .name = "Xvid", -+ .fourcc = V4L2_PIX_FMT_XVID, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_XVID */ -+#ifdef V4L2_PIX_FMT_VC1_ANNEX_G -+ { -+ .name = "SMPTE 421M Annex G compliant stream", -+ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_G, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_VC1_ANNEX_G */ -+#ifdef V4L2_PIX_FMT_VC1_ANNEX_L -+ { -+ .name = "SMPTE 421M Annex L compliant stream", -+ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_L, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_VC1_ANNEX_L */ -+#ifdef V4L2_PIX_FMT_VP8 -+ { -+ .name = "VP8", -+ .fourcc = V4L2_PIX_FMT_VP8, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_VP8 */ -+#ifdef V4L2_PIX_FMT_VP9 -+ { -+ .name = "VP9", -+ .fourcc = V4L2_PIX_FMT_VP9, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_VP9 */ -+#ifdef V4L2_PIX_FMT_HEVC -+ { -+ .name = "HEVC", -+ .fourcc = V4L2_PIX_FMT_HEVC, -+ .depth = 32, -+ .flags = FORMAT_FLAGS_COMPRESSED, -+ }, -+#endif /* V4L2_PIX_FMT_HEVC */ -+};
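
For reference: the v4l2loopback.h hunk above documents a small control interface on /dev/v4l2loopback (V4L2LOOPBACK_CTL_ADD / _QUERY / _REMOVE operating on struct v4l2_loopback_config). Below is a minimal userspace sketch of that interface, appended outside the diff. It is illustrative only; the include path of the header and all error handling are assumptions and not part of the removed driver code, and only the ioctls and struct fields declared in that header are used.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "v4l2loopback.h"   /* header from the hunk above; install path is an assumption */

int main(void)
{
	struct v4l2_loopback_config cfg;
	int ctl, nr;

	ctl = open("/dev/v4l2loopback", O_RDWR);
	if (ctl < 0) {
		perror("open /dev/v4l2loopback");
		return 1;
	}

	memset(&cfg, 0, sizeof(cfg));
	cfg.output_nr = -1;          /* <0: let the driver allocate a free /dev/videoN */
	cfg.announce_all_caps = 1;   /* announce OUTPUT and CAPTURE caps on the one node */
	snprintf(cfg.card_label, sizeof(cfg.card_label), "example loopback");
	/* size/buffer limits left at 0 fall back to the module defaults */

	nr = ioctl(ctl, V4L2LOOPBACK_CTL_ADD, &cfg);  /* returns the new device number */
	if (nr < 0) {
		perror("V4L2LOOPBACK_CTL_ADD");
		close(ctl);
		return 1;
	}
	printf("created /dev/video%d\n", nr);

	/* ...stream to/from the device, then remove it again by number... */
	if (ioctl(ctl, V4L2LOOPBACK_CTL_REMOVE, nr) < 0)
		perror("V4L2LOOPBACK_CTL_REMOVE");

	close(ctl);
	return 0;
}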