% \iffalse % %% File l3sort.dtx % % Copyright (C) 2012-2024 The LaTeX Project % % It may be distributed and/or modified under the conditions of the % LaTeX Project Public License (LPPL), either version 1.3c of this % license or (at your option) any later version. The latest version % of this license is in the file % % https://www.latex-project.org/lppl.txt % % This file is part of the "l3kernel bundle" (The Work in LPPL) % and all files in that bundle must be distributed together. % % ----------------------------------------------------------------------- % % The development version of the bundle can be found at % % https://github.com/latex3/latex3 % % for those people who are interested. % %<*driver> \documentclass[full,kernel]{l3doc} \begin{document} \DocInput{\jobname.dtx} \end{document} % % \fi % % \title{^^A % The \pkg{l3sort} module\\ Sorting functions^^A % } % % \author{^^A % The \LaTeX{} Project\thanks % {^^A % E-mail: % \href{mailto:latex-team@latex-project.org} % {latex-team@latex-project.org}^^A % }^^A % } % % \date{Released 2024-03-14} % % \maketitle % % \begin{documentation} % % \section{Controlling sorting} % % \label{sec:l3sort:mech} % % \LaTeX3 comes with a facility to sort list variables (sequences, % token lists, or comma-lists) according to some user-defined % comparison. For instance, % \begin{verbatim} % \clist_set:Nn \l_foo_clist { 3 , 01 , -2 , 5 , +1 } % \clist_sort:Nn \l_foo_clist % { % \int_compare:nNnTF { #1 } > { #2 } % { \sort_return_swapped: } % { \sort_return_same: } % } % \end{verbatim} % results in \cs[no-index]{l_foo_clist} holding the values % |{ -2 , 01 , +1 , 3 , 5 }| sorted in non-decreasing order. % % The code defining the comparison should call % \cs{sort_return_swapped:} if the two items given as |#1| % and |#2| are not in the correct order, and otherwise it % should call \cs{sort_return_same:} to indicate that % the order of this pair of items should not be changed. % % For instance, a \meta{comparison code} consisting only % of \cs{sort_return_same:} with no test yields a trivial % sort: the final order is identical to the original order. % Conversely, using a \meta{comparison code} consisting only % of \cs{sort_return_swapped:} reverses the list (in a fairly % inefficient way). % % \begin{texnote} % The current implementation is limited to sorting approximately % $20000$ items ($40000$ in \LuaTeX{}), depending on what other % packages are loaded. % % Internally, the code from \pkg{l3sort} stores items in \tn{toks} % registers allocated locally. Thus, the \meta{comparison code} % should not call \tn{newtoks} or other commands that allocate new % \tn{toks} registers. On the other hand, altering the value of a % previously allocated \tn{toks} register is not a problem. % \end{texnote} % % \begin{function}[added = 2017-02-06]{\sort_return_same:, \sort_return_swapped:} % \begin{syntax} % \cs{seq_sort:Nn} \meta{seq~var} % ~~|{| \ldots{} \cs{sort_return_same:} or \cs{sort_return_swapped:} \ldots{} |}| % \end{syntax} % Indicates whether to keep the order or swap the order of two items % that are compared in the sorting code. Only one of the % \cs[no-index]{sort_return_\ldots{}} functions should be used by the % code, according to the results of some tests on the items |#1| and % |#2| to be compared. % \end{function} % % \end{documentation} % % \begin{implementation} % % \section{\pkg{l3sort} implementation} % % \begin{macrocode} %<*package> % \end{macrocode} % % \begin{macrocode} %<@@=sort> % \end{macrocode} % % \subsection{Variables} % % \begin{variable}{\g_@@_internal_seq, \g_@@_internal_tl} % Sorting happens in a group; the result is stored in those global % variables before being copied outside the group to the proper % places. For seq and tl this is more efficient than using \cs{use:e} % (or some \cs{exp_args:NNNe}) to smuggle the definition outside the % group since \TeX{} does not need to re-read tokens. For clist we % don't gain anything since the result is converted from seq to clist % anyways. % \begin{macrocode} \seq_new:N \g_@@_internal_seq \tl_new:N \g_@@_internal_tl % \end{macrocode} % \end{variable} % % \begin{variable} % { % \l_@@_length_int, \l_@@_min_int, \l_@@_top_int, \l_@@_max_int, % \l_@@_true_max_int % } % The sequence has \cs{l_@@_length_int} items and is stored from % \cs{l_@@_min_int} to $\cs{l_@@_top_int}-1$. While reading the % sequence in memory, we check that \cs{l_@@_top_int} remains at most % \cs{l_@@_max_int}, precomputed by \cs{@@_compute_range:}. That % bound is such that the merge sort only uses \tn{toks} registers % less than \cs{l_@@_true_max_int}, namely those that have not been % allocated for use in other code: the user's comparison code could % alter these. % \begin{macrocode} \int_new:N \l_@@_length_int \int_new:N \l_@@_min_int \int_new:N \l_@@_top_int \int_new:N \l_@@_max_int \int_new:N \l_@@_true_max_int % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_block_int} % Merge sort is done in several passes. In each pass, blocks of size % \cs{l_@@_block_int} are merged in pairs. The block size starts % at $1$, and, for a length in the range $[2^k+1,2^{k+1}]$, reaches % $2^{k}$ in the last pass. % \begin{macrocode} \int_new:N \l_@@_block_int % \end{macrocode} % \end{variable} % % \begin{variable}{\l_@@_begin_int} % \begin{variable}{\l_@@_end_int} % When merging two blocks, \cs{l_@@_begin_int} marks the lowest % index in the two blocks, and \cs{l_@@_end_int} marks the % highest index, plus $1$. % \begin{macrocode} \int_new:N \l_@@_begin_int \int_new:N \l_@@_end_int % \end{macrocode} % \end{variable} % \end{variable} % % \begin{variable}{\l_@@_A_int} % \begin{variable}{\l_@@_B_int} % \begin{variable}{\l_@@_C_int} % When merging two blocks (whose end-points are \texttt{beg} % and \texttt{end}), $A$ starts from the high end of the low % block, and decreases until reaching \texttt{beg}. The index % $B$ starts from the top of the range and marks the register % in which a sorted item should be put. Finally, $C$ points % to the copy of the high block in the interval of registers % starting at \cs{l_@@_length_int}, upwards. $C$ starts % from the upper limit of that range. % \begin{macrocode} \int_new:N \l_@@_A_int \int_new:N \l_@@_B_int \int_new:N \l_@@_C_int % \end{macrocode} % \end{variable} % \end{variable} % \end{variable} % % \begin{variable}{\s_@@_mark,\s_@@_stop} % Internal scan marks. % \begin{macrocode} \scan_new:N \s_@@_mark \scan_new:N \s_@@_stop % \end{macrocode} % \end{variable} % % \subsection{Finding available \tn{toks} registers} % % \begin{macro}{\@@_shrink_range:} % \begin{macro}{\@@_shrink_range_loop:} % After \cs{@@_compute_range:} (defined below) determines that % \tn{toks} registers between \cs{l_@@_min_int} (included) and % \cs{l_@@_true_max_int} (excluded) have not yet been assigned, % \cs{@@_shrink_range:} computes \cs{l_@@_max_int} to reflect the need % for a buffer when merging blocks in the merge sort. Given % $2^{n}\leq A\leq 2^{n}+2^{n-1}$ registers we can sort $\lfloor % A/2\rfloor+2^{n-2}$ items while if we have $2^{n}+2^{n-1}\leq A\leq % 2^{n+1}$ registers we can sort $A-2^{n-1}$ items. We first find out % a power $2^{n}$ such that $2^{n}\leq A\leq 2^{n+1}$ by repeatedly % halving \cs{l_@@_block_int}, starting at $2^{15}$ or $2^{14}$ namely % half the total number of registers, then we use the formulas and set % \cs{l_@@_max_int}. % \begin{macrocode} \cs_new_protected:Npn \@@_shrink_range: { \int_set:Nn \l_@@_A_int { \l_@@_true_max_int - \l_@@_min_int + 1 } \int_set:Nn \l_@@_block_int { \c_max_register_int / 2 } \@@_shrink_range_loop: \int_set:Nn \l_@@_max_int { \int_compare:nNnTF { \l_@@_block_int * 3 / 2 } > \l_@@_A_int { \l_@@_min_int + ( \l_@@_A_int - 1 ) / 2 + \l_@@_block_int / 4 - 1 } { \l_@@_true_max_int - \l_@@_block_int / 2 } } } \cs_new_protected:Npn \@@_shrink_range_loop: { \if_int_compare:w \l_@@_A_int < \l_@@_block_int \tex_divide:D \l_@@_block_int 2 \exp_stop_f: \exp_after:wN \@@_shrink_range_loop: \fi: } % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}{\@@_compute_range:, \@@_redefine_compute_range:} % \begin{variable}{\c_@@_max_length_int} % First find out what \tn{toks} have not yet been assigned. There are % many cases. In \LaTeXe{} with no package, available \tn{toks} range % from $\tn{count}15+1$ to \cs{c_max_register_int} included (this was % not altered despite the 2015 changes). When \tn{loctoks} is % defined, namely in plain (e)\TeX{}, or when the package \pkg{etex} % is loaded in \LaTeXe{}, redefine \cs{@@_compute_range:} to use the % range $\tn{count}265$ to $\tn{count}275-1$. The \pkg{elocalloc} % package also defines \tn{loctoks} but uses yet another number for % the upper bound, namely \cs{e@alloc@top} (minus one). We must check % for \tn{loctoks} every time a sorting function is called, as % \pkg{etex} or \pkg{elocalloc} could be loaded. % % In \ConTeXt{} MkIV the range is from % $|\c_syst_last_allocated_toks|+1$ to \cs{c_max_register_int}, and in % MkII it is from $|\lastallocatedtoks|+1$ to \cs{c_max_register_int}. % In all these cases, call \cs{@@_shrink_range:}. % \begin{macrocode} \cs_new_protected:Npn \@@_compute_range: { \int_set:Nn \l_@@_min_int { \tex_count:D 15 + 1 } \int_set:Nn \l_@@_true_max_int { \c_max_register_int + 1 } \@@_shrink_range: \if_meaning:w \loctoks \tex_undefined:D \else: \if_meaning:w \loctoks \scan_stop: \else: \@@_redefine_compute_range: \@@_compute_range: \fi: \fi: } \cs_new_protected:Npn \@@_redefine_compute_range: { \cs_if_exist:cTF { ver@elocalloc.sty } { \cs_gset_protected:Npn \@@_compute_range: { \int_set:Nn \l_@@_min_int { \tex_count:D 265 } \int_set_eq:NN \l_@@_true_max_int \e@alloc@top \@@_shrink_range: } } { \cs_gset_protected:Npn \@@_compute_range: { \int_set:Nn \l_@@_min_int { \tex_count:D 265 } \int_set:Nn \l_@@_true_max_int { \tex_count:D 275 } \@@_shrink_range: } } } \cs_if_exist:NT \loctoks { \@@_redefine_compute_range: } \tl_map_inline:nn { \lastallocatedtoks \c_syst_last_allocated_toks } { \cs_if_exist:NT #1 { \cs_gset_protected:Npn \@@_compute_range: { \int_set:Nn \l_@@_min_int { #1 + 1 } \int_set:Nn \l_@@_true_max_int { \c_max_register_int + 1 } \@@_shrink_range: } } } % \end{macrocode} % \end{variable} % \end{macro} % % \subsection{Protected user commands} % % \begin{macro}{\@@_main:NNNn} % Sorting happens in three steps. First store items in \tn{toks} % registers ranging from \cs{l_@@_min_int} to $\cs{l_@@_top_int}-1$, % while checking that the list is not too long. If we reach the % maximum length, that's an error; exit the group. Secondly, sort the % array of \tn{toks} registers, using the user-defined sorting % function: \cs{@@_level:} calls \cs{@@_compare:nn} as needed. % Finally, unpack the \tn{toks} registers (now sorted) into the target % tl, or into \cs{g_@@_internal_seq} for seq and clist. This is done % by \cs{@@_seq:NNNNn} and \cs{@@_tl:NNn}. % \begin{macrocode} \cs_new_protected:Npn \@@_main:NNNn #1#2#3#4 { \@@_disable_toksdef: \@@_compute_range: \int_set_eq:NN \l_@@_top_int \l_@@_min_int #1 #3 { \if_int_compare:w \l_@@_top_int = \l_@@_max_int \@@_too_long_error:NNw #2 #3 \fi: \tex_toks:D \l_@@_top_int {##1} \int_incr:N \l_@@_top_int } \int_set:Nn \l_@@_length_int { \l_@@_top_int - \l_@@_min_int } \cs_set:Npn \@@_compare:nn ##1 ##2 {#4} \int_set:Nn \l_@@_block_int { 1 } \@@_level: } % \end{macrocode} % \end{macro} % % \begin{macro}{\tl_sort:Nn, \tl_sort:cn, \tl_gsort:Nn, \tl_gsort:cn} % \begin{macro}{\@@_tl:NNn} % \begin{macro}[EXP]{\@@_tl_toks:w} % Call the main sorting function then unpack \tn{toks} registers % outside the group into the target token list. The unpacking is done % by \cs{@@_tl_toks:w}; registers are numbered from \cs{l_@@_min_int} % to $\cs{l_@@_top_int}-1$. For expansion behaviour we need a couple % of primitives. The \cs{tl_gclear:N} reduces memory usage. The % \cs{prg_break_point:} is used by \cs{@@_main:NNNn} when the list is % too long. % \begin{macrocode} \cs_new_protected:Npn \tl_sort:Nn { \@@_tl:NNn \tl_set_eq:NN } \cs_generate_variant:Nn \tl_sort:Nn { c } \cs_new_protected:Npn \tl_gsort:Nn { \@@_tl:NNn \tl_gset_eq:NN } \cs_generate_variant:Nn \tl_gsort:Nn { c } \cs_new_protected:Npn \@@_tl:NNn #1#2#3 { \group_begin: \@@_main:NNNn \tl_map_inline:Nn \tl_map_break:n #2 {#3} \__kernel_tl_gset:Nx \g_@@_internal_tl { \@@_tl_toks:w \l_@@_min_int ; } \group_end: #1 #2 \g_@@_internal_tl \tl_gclear:N \g_@@_internal_tl \prg_break_point: } \cs_new:Npn \@@_tl_toks:w #1 ; { \if_int_compare:w #1 < \l_@@_top_int { \tex_the:D \tex_toks:D #1 } \exp_after:wN \@@_tl_toks:w \int_value:w \int_eval:n { #1 + 1 } \exp_after:wN ; \fi: } % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % % \begin{macro}{\seq_sort:Nn, \seq_sort:cn, \seq_gsort:Nn, \seq_gsort:cn} % \begin{macro}{\clist_sort:Nn, \clist_sort:cn, \clist_gsort:Nn, \clist_gsort:cn} % \begin{macro}{\@@_seq:NNNNn} % Use the same general framework for seq and clist. Apply the general % sorting code, then unpack \tn{toks} into \cs{g_@@_internal_seq}. % Outside the group copy or convert (for clist) the data to the target % variable. The \cs{seq_gclear:N} reduces memory usage. The % \cs{prg_break_point:} is used by \cs{@@_main:NNNn} when the list is % too long. % \begin{macrocode} \cs_new_protected:Npn \seq_sort:Nn { \@@_seq:NNNNn \seq_map_inline:Nn \seq_map_break:n \seq_set_eq:NN } \cs_generate_variant:Nn \seq_sort:Nn { c } \cs_new_protected:Npn \seq_gsort:Nn { \@@_seq:NNNNn \seq_map_inline:Nn \seq_map_break:n \seq_gset_eq:NN } \cs_generate_variant:Nn \seq_gsort:Nn { c } \cs_new_protected:Npn \clist_sort:Nn { \@@_seq:NNNNn \clist_map_inline:Nn \clist_map_break:n \clist_set_from_seq:NN } \cs_generate_variant:Nn \clist_sort:Nn { c } \cs_new_protected:Npn \clist_gsort:Nn { \@@_seq:NNNNn \clist_map_inline:Nn \clist_map_break:n \clist_gset_from_seq:NN } \cs_generate_variant:Nn \clist_gsort:Nn { c } \cs_new_protected:Npn \@@_seq:NNNNn #1#2#3#4#5 { \group_begin: \@@_main:NNNn #1 #2 #4 {#5} \seq_gclear:N \g_@@_internal_seq \int_step_inline:nnn \l_@@_min_int { \l_@@_top_int - 1 } { \seq_gput_right:Ne \g_@@_internal_seq { \tex_the:D \tex_toks:D ##1 } } \group_end: #3 #4 \g_@@_internal_seq \seq_gclear:N \g_@@_internal_seq \prg_break_point: } % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % % \subsection{Merge sort} % % \begin{macro}{\@@_level:} % This function is called once blocks of size \cs{l_@@_block_int} % (initially $1$) are each sorted. If the whole list fits in one % block, then we are done (this also takes care of the case of an % empty list or a list with one item). Otherwise, go through pairs % of blocks starting from $0$, then double the block size, and repeat. % \begin{macrocode} \cs_new_protected:Npn \@@_level: { \if_int_compare:w \l_@@_block_int < \l_@@_length_int \l_@@_end_int \l_@@_min_int \@@_merge_blocks: \tex_advance:D \l_@@_block_int \l_@@_block_int \exp_after:wN \@@_level: \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_merge_blocks:} % This function is called to merge a pair of blocks, starting at % the last value of \cs{l_@@_end_int} (end-point of the previous % pair of blocks). If shifting by one block to the right we reach % the end of the list, then this pass has ended: the end of the % list is sorted already. Otherwise, store the result of that shift in $A$, % which indexes the first block starting from the top end. % Then locate the end-point (maximum) of the second block: shift % \texttt{end} upwards by one more block, but keeping it % $\leq\texttt{top}$. Copy this upper block of \tn{toks} % registers in registers above \texttt{length}, indexed by $C$: % this is covered by \cs{@@_copy_block:}. Once this is done we % are ready to do the actual merger using \cs{@@_merge_blocks_aux:}, % after shifting $A$, $B$ and $C$ so that they point to the largest % index in their respective ranges rather than pointing just beyond % those ranges. Of course, once that pair of blocks is merged, % move on to the next pair. % \begin{macrocode} \cs_new_protected:Npn \@@_merge_blocks: { \l_@@_begin_int \l_@@_end_int \tex_advance:D \l_@@_end_int \l_@@_block_int \if_int_compare:w \l_@@_end_int < \l_@@_top_int \l_@@_A_int \l_@@_end_int \tex_advance:D \l_@@_end_int \l_@@_block_int \if_int_compare:w \l_@@_end_int > \l_@@_top_int \l_@@_end_int \l_@@_top_int \fi: \l_@@_B_int \l_@@_A_int \l_@@_C_int \l_@@_top_int \@@_copy_block: \int_decr:N \l_@@_A_int \int_decr:N \l_@@_B_int \int_decr:N \l_@@_C_int \exp_after:wN \@@_merge_blocks_aux: \exp_after:wN \@@_merge_blocks: \fi: } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_copy_block:} % We wish to store a copy of the \enquote{upper} block of % \tn{toks} registers, ranging between the initial value of % \cs{l_@@_B_int} (included) and \cs{l_@@_end_int} % (excluded) into a new range starting at the initial value % of \cs{l_@@_C_int}, namely \cs{l_@@_top_int}. % \begin{macrocode} \cs_new_protected:Npn \@@_copy_block: { \tex_toks:D \l_@@_C_int \tex_toks:D \l_@@_B_int \int_incr:N \l_@@_C_int \int_incr:N \l_@@_B_int \if_int_compare:w \l_@@_B_int = \l_@@_end_int \use_i:nn \fi: \@@_copy_block: } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_merge_blocks_aux:} % At this stage, the first block starts at \cs{l_@@_begin_int}, % and ends at \cs{l_@@_A_int}, and the second block starts at % \cs{l_@@_top_int} and ends at \cs{l_@@_C_int}. The result % of the merger is stored at positions indexed by \cs{l_@@_B_int}, % which starts at $\cs{l_@@_end_int}-1$ and decreases down to % \cs{l_@@_begin_int}, covering the full range of the two blocks. % In other words, we are building the merger starting with the % largest values. % The comparison function is defined to return either % \texttt{swapped} or \texttt{same}. Of course, this % means the arguments need to be given in the order they % appear originally in the list. % \begin{macrocode} \cs_new_protected:Npn \@@_merge_blocks_aux: { \exp_after:wN \@@_compare:nn \exp_after:wN { \tex_the:D \tex_toks:D \exp_after:wN \l_@@_A_int \exp_after:wN } \exp_after:wN { \tex_the:D \tex_toks:D \l_@@_C_int } \prg_do_nothing: \@@_return_mark:w \@@_return_mark:w \s_@@_mark \@@_return_none_error: } % \end{macrocode} % \end{macro} % % \begin{macro}{\sort_return_same:, \sort_return_swapped:} % \begin{macro}{\@@_return_mark:w} % \begin{macro}{\@@_return_none_error:, \@@_return_two_error:} % Each comparison should call \cs{sort_return_same:} or % \cs{sort_return_swapped:} exactly once. If neither is called, % \cs{@@_return_none_error:} is called, since the \texttt{return_mark} % removes tokens until \cs{s_@@_mark}. If one is called, the % \texttt{return_mark} auxiliary removes everything except % \cs{@@_return_same:w} (or its \texttt{swapped} analogue) followed by % \cs{@@_return_none_error:}. Finally if two or more are called, % \cs{@@_return_two_error:} ends up before any \cs{@@_return_mark:w}, % so that it produces an error. % \begin{macrocode} \cs_new_protected:Npn \sort_return_same: #1 \@@_return_mark:w #2 \s_@@_mark { #1 #2 \@@_return_two_error: \@@_return_mark:w \s_@@_mark \@@_return_same:w } \cs_new_protected:Npn \sort_return_swapped: #1 \@@_return_mark:w #2 \s_@@_mark { #1 #2 \@@_return_two_error: \@@_return_mark:w \s_@@_mark \@@_return_swapped:w } \cs_new_protected:Npn \@@_return_mark:w #1 \s_@@_mark { } \cs_new_protected:Npn \@@_return_none_error: { \msg_error:nnee { sort } { return-none } { \tex_the:D \tex_toks:D \l_@@_A_int } { \tex_the:D \tex_toks:D \l_@@_C_int } \@@_return_same:w \@@_return_none_error: } \cs_new_protected:Npn \@@_return_two_error: { \msg_error:nnee { sort } { return-two } { \tex_the:D \tex_toks:D \l_@@_A_int } { \tex_the:D \tex_toks:D \l_@@_C_int } } % \end{macrocode} % \end{macro} % \end{macro} % \end{macro} % % \begin{macro}{\@@_return_same:w} % If the comparison function returns \texttt{same}, % then the second argument fed to \cs{@@_compare:nn} % should remain to the right of the other one. Since % we build the merger starting from the right, we copy % that \tn{toks} register into the allotted range, then % shift the pointers $B$ and $C$, and go on to do one % more step in the merger, unless the second block has % been exhausted: then the remainder of the first block % is already in the correct registers and we are done % with merging those two blocks. % \begin{macrocode} \cs_new_protected:Npn \@@_return_same:w #1 \@@_return_none_error: { \tex_toks:D \l_@@_B_int \tex_toks:D \l_@@_C_int \int_decr:N \l_@@_B_int \int_decr:N \l_@@_C_int \if_int_compare:w \l_@@_C_int < \l_@@_top_int \use_i:nn \fi: \@@_merge_blocks_aux: } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_return_swapped:w} % If the comparison function returns \texttt{swapped}, % then the next item to add to the merger is the first % argument, contents of the \tn{toks} register $A$. % Then shift the pointers $A$ and $B$ to the left, and % go for one more step for the merger, unless the left % block was exhausted ($A$ goes below the threshold). % In that case, all remaining \tn{toks} registers in % the second block, indexed by $C$, are copied % to the merger by \cs{@@_merge_blocks_end:}. % \begin{macrocode} \cs_new_protected:Npn \@@_return_swapped:w #1 \@@_return_none_error: { \tex_toks:D \l_@@_B_int \tex_toks:D \l_@@_A_int \int_decr:N \l_@@_B_int \int_decr:N \l_@@_A_int \if_int_compare:w \l_@@_A_int < \l_@@_begin_int \@@_merge_blocks_end: \use_i:nn \fi: \@@_merge_blocks_aux: } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_merge_blocks_end:} % This function's task is to copy the \tn{toks} registers % in the block indexed by $C$ to the merger indexed by $B$. % The end can equally be detected by checking when $B$ reaches % the threshold \texttt{begin}, or when $C$ reaches % \texttt{top}. % \begin{macrocode} \cs_new_protected:Npn \@@_merge_blocks_end: { \tex_toks:D \l_@@_B_int \tex_toks:D \l_@@_C_int \int_decr:N \l_@@_B_int \int_decr:N \l_@@_C_int \if_int_compare:w \l_@@_B_int < \l_@@_begin_int \use_i:nn \fi: \@@_merge_blocks_end: } % \end{macrocode} % \end{macro} % % \subsection{Expandable sorting} % % Sorting expandably is very different from sorting and assigning to a % variable. Since tokens cannot be stored, they must remain in the % input stream, and be read through at every step. It is thus % necessarily much slower (at best $O(n^2\ln n)$) than non-expandable % sorting functions ($O(n\ln n)$). % % A prototypical version of expandable quicksort is as follows. If the % argument has no item, return nothing, otherwise partition, using the % first item as a pivot (argument |#4| of \cs{@@:nnNnn}). The % arguments of \cs{@@:nnNnn} are 1.~items less than |#4|, 2.~items % greater or equal to |#4|, 3.~comparison, 4.~pivot, 5.~next item to % test. If |#5| is the tail of the list, call \cs{tl_sort:nN} on |#1| % and on |#2|, placing |#4| in between; |\use:ff| expands the parts to % make \cs{tl_sort:nN} \texttt{f}-expandable. Otherwise, compare |#4| % and |#5| using |#3|. If they are ordered, place |#5| amongst the % \enquote{greater} items, otherwise amongst the \enquote{lesser} items, % and continue partitioning. % \begin{verbatim} % \cs_new:Npn \tl_sort:nN #1#2 % { % \tl_if_blank:nF {#1} % { % \__sort:nnNnn { } { } #2 % #1 \q__sort_recursion_tail \q__sort_recursion_stop % } % } % \cs_new:Npn \__sort:nnNnn #1#2#3#4#5 % { % \quark_if_recursion_tail_stop_do:nn {#5} % { \use:ff { \tl_sort:nN {#1} #3 {#4} } { \tl_sort:nN {#2} #3 } } % #3 {#4} {#5} % { \__sort:nnNnn {#1} { #2 {#5} } #3 {#4} } % { \__sort:nnNnn { #1 {#5} } {#2} #3 {#4} } % } % \cs_generate_variant:Nn \use:nn { ff } % \end{verbatim} % There are quite a few optimizations available here: the code below is % less legible, but more than twice as fast. % % In the simple version of the code, \cs{@@:nnNnn} is called % $O(n\ln n)$ times on average (the number of comparisons required by % the quicksort algorithm). Hence most of our focus is on % optimizing that function. % % The first speed up is to avoid testing for the end of the list at % every call to \cs{@@:nnNnn}. For this, the list is prepared by % changing each \meta{item} of the original token list into % \meta{command} \Arg{item}, just like sequences are stored. We arrange % things such that the \meta{command} is the \meta{conditional} provided % by the user: the loop over the \meta{prepared tokens} then looks like % \begin{quote} % \ttfamily % \cs{cs_new:Npn}~\cs{@@_loop:wNn}~\ldots{}~|#6#7|\\ % ~~|{|\\ % ~~~~|#6|~\Arg{pivot}~|{#7}|~\meta{loop big}~\meta{loop small}\\ % ~~~~~~\meta{extra arguments}\\ % ~~|}|\\ % \cs{@@_loop:wNn}~\ldots{}~\meta{prepared tokens}\\ % ~~\meta{end-loop}~|{}|~\cs{s_@@_stop} % \end{quote} % In this example, which matches the structure of % \cs{@@_quick_split_i:NnnnnNn} and a few other functions below, the % \cs{@@_loop:wNn} auxiliary normally receives the user's % \meta{conditional} as~|#6| and an \meta{item} as~|#7|. This is % compared to the \meta{pivot} (the argument~|#5|, not shown here), and % the \meta{conditional} leaves the \meta{loop big} or \meta{loop small} % auxiliary, which both have the same form as \cs{@@_loop:wNn}, % receiving the next pair \meta{conditional} \Arg{item} as |#6| % and~|#7|. At the end, |#6| is the \meta{end-loop} function, which % terminates the loop. % % The second speed up is to minimize the duplicated tokens between the % \texttt{true} and \texttt{false} branches of the conditional. For % this, we introduce two versions of \cs{@@:nnNnn}, which receive % the new item as~|#1| and place it either into the list~|#2| of items % less than the pivot~|#4| or into the list~|#3| of items greater or % equal to the pivot. % \begin{verbatim} % \cs_new:Npn \__sort_i:nnnnNn #1#2#3#4#5#6 % { % #5 {#4} {#6} \__sort_ii:nnnnNn \__sort_i:nnnnNn % {#6} { #2 {#1} } {#3} {#4} % } % \cs_new:Npn \__sort_ii:nnnnNn #1#2#3#4#5#6 % { % #5 {#4} {#6} \__sort_ii:nnnnNn \__sort_i:nnnnNn % {#6} {#2} { #3 {#1} } {#4} % } % \end{verbatim} % Note that the two functions have the form of \cs{@@_loop:wNn} above, % receiving as~|#5| the conditional or a function to end the loop. In % fact, the lists~|#2| and~|#3| must be made of pairs \meta{conditional} % \Arg{item}, so we have to replace~|{#6}| above by |{|~|#5|~|{#6}|~|}|, % and |{#1}|~by~|#1|. The actual functions have one more argument, so % all argument numbers are shifted compared to this code. % % The third speed up is to avoid |\use:ff| using a continuation-passing % style: \cs{@@_quick_split:NnNn} expects a list followed by % \cs{s_@@_mark} \Arg{code}, and expands to \meta{code} \meta{sorted list}. % Sorting the two parts of the list around the pivot is done with % \begin{quote} % \ttfamily % \cs{@@_quick_split:NnNn} |#2| \ldots{} \cs{s_@@_mark}\\ % ~~|{|\\ % ~~~~\cs{@@_quick_split:NnNn} |#1| \ldots{} \cs{s_@@_mark} \Arg{code}\\ % ~~~~\Arg{pivot}\\ % ~~|}| % \end{quote} % Items which are larger than the \meta{pivot} are sorted, then placed % after code that sorts the smaller items, and after the (braced) % \meta{pivot}. % % The fourth speed up is avoid the recursive call to \cs{tl_sort:nN} % with an empty first argument. For this, we introduce functions % similar to the \cs{@@_i:nnnnNn} of the last example, but aware of % whether the list of \meta{conditional} \Arg{item} read so far that are % less than the pivot, and the list of those greater or equal, are empty % or not: see \cs{@@_quick_split:NnNn} and functions defined below. % Knowing whether the lists are empty or not is useless if we do not use % distinct ending codes as appropriate. The splitting auxiliaries % communicate to the \meta{end-loop} function (that is initially placed % after the ``prepared'' list) by placing a specific ending function, % ignored when looping, but useful at the end. In fact, the % \meta{end-loop} function does nothing but place the appropriate ending % function in front of all its arguments. The ending functions take % care of sorting non-empty sublists, placing the pivot in between, and % the continuation before. % % The final change in fact slows down the code a little, but is required % to avoid memory issues: schematically, when \TeX{} encounters % \begin{verbatim} % \use:n { \use:n { \use:n { ... } ... } ... } % \end{verbatim} % the argument of the first \cs{use:n} is not completely read by the % second \cs{use:n}, hence must remain in memory; then the argument of % the second \cs{use:n} is not completely read when grabbing the % argument of the third \cs{use:n}, hence must remain in memory, and so % on. The memory consumption grows quadratically with the number of % nested \cs{use:n}. In practice, this means that we must read % everything until a trailing \cs{s_@@_stop} once in a while, otherwise % sorting lists of more than a few thousand items would exhaust a % typical \TeX{}'s memory. % % \begin{macro}[EXP]{\tl_sort:nN} % \begin{macro}[EXP] % { % \@@_quick_prepare:Nnnn, % \@@_quick_prepare_end:NNNnw, % \@@_quick_cleanup:w % } % The code within the \cs{exp_not:f} sorts the list, leaving in most % cases a leading \cs{exp_not:f}, which stops the expansion, letting % the result be return within \cs{exp_not:n}. We filter out the case % of a list with no item, which would otherwise cause problems. Then % prepare the token list~|#1| by inserting the conditional~|#2| before % each item. The \texttt{prepare} auxiliary receives the conditional % as~|#1|, the prepared token list so far as~|#2|, the next prepared % item as~|#3|, and the item after that as~|#4|. The loop ends % when~|#4| contains \cs{prg_break_point:}, then the % \texttt{prepare_end} auxiliary finds the prepared token list % as~|#4|. The scene is then set up for \cs{@@_quick_split:NnNn}, % which sorts the prepared list and perform the post action placed % after \cs{s_@@_mark}, namely removing the trailing \cs{s_@@_stop} and % \cs{s_@@_stop} and leaving \cs{exp_stop_f:} to stop % \texttt{f}-expansion. % \begin{macrocode} \cs_new:Npn \tl_sort:nN #1#2 { \exp_not:f { \tl_if_blank:nF {#1} { \@@_quick_prepare:Nnnn #2 { } { } #1 { \prg_break_point: \@@_quick_prepare_end:NNNnw } \s_@@_stop } } } \cs_new:Npn \@@_quick_prepare:Nnnn #1#2#3#4 { \prg_break: #4 \prg_break_point: \@@_quick_prepare:Nnnn #1 { #2 #3 } { #1 {#4} } } \cs_new:Npn \@@_quick_prepare_end:NNNnw #1#2#3#4#5 \s_@@_stop { \@@_quick_split:NnNn #4 \@@_quick_end:nnTFNn { } \s_@@_mark { \@@_quick_cleanup:w \exp_stop_f: } \s_@@_mark \s_@@_stop } \cs_new:Npn \@@_quick_cleanup:w #1 \s_@@_mark \s_@@_stop {#1} % \end{macrocode} % \end{macro} % \end{macro} % % \begin{macro}[EXP] % { % \@@_quick_split:NnNn, % \@@_quick_only_i:NnnnnNn, % \@@_quick_only_ii:NnnnnNn, % \@@_quick_split_i:NnnnnNn, % \@@_quick_split_ii:NnnnnNn % } % The \texttt{only_i}, \texttt{only_ii}, \texttt{split_i} and % \texttt{split_ii} auxiliaries receive a useless first argument, the % new item~|#2| (that they append to either one of the next two % arguments), the list~|#3| of items less than the pivot, bigger % items~|#4|, the pivot~|#5|, a \meta{function}~|#6|, and an % item~|#7|. The \meta{function} is the user's \meta{conditional} % except at the end of the list where it is % \cs{@@_quick_end:nnTFNn}. The comparison is applied to the % \meta{pivot} and the \meta{item}, and calls the \texttt{only_i} or % \texttt{split_i} auxiliaries if the \meta{item} is smaller, and the % \texttt{only_ii} or \texttt{split_ii} auxiliaries otherwise. In % both cases, the next auxiliary goes to work right away, with no % intermediate expansion that would slow down operations. Note that % the argument~|#2| left for the next call has the form % \meta{conditional} \Arg{item}, so that the lists~|#3| and~|#4| keep % the right form to be fed to the next sorting function. % The \texttt{split} auxiliary differs from these in that it is % missing three of the arguments, which would be empty, and its first % argument is always the user's \meta{conditional} rather than an % ending function. % \begin{macrocode} \cs_new:Npn \@@_quick_split:NnNn #1#2#3#4 { #3 {#2} {#4} \@@_quick_only_ii:NnnnnNn \@@_quick_only_i:NnnnnNn \@@_quick_single_end:nnnwnw { #3 {#4} } { } { } {#2} } \cs_new:Npn \@@_quick_only_i:NnnnnNn #1#2#3#4#5#6#7 { #6 {#5} {#7} \@@_quick_split_ii:NnnnnNn \@@_quick_only_i:NnnnnNn \@@_quick_only_i_end:nnnwnw { #6 {#7} } { #3 #2 } { } {#5} } \cs_new:Npn \@@_quick_only_ii:NnnnnNn #1#2#3#4#5#6#7 { #6 {#5} {#7} \@@_quick_only_ii:NnnnnNn \@@_quick_split_i:NnnnnNn \@@_quick_only_ii_end:nnnwnw { #6 {#7} } { } { #4 #2 } {#5} } \cs_new:Npn \@@_quick_split_i:NnnnnNn #1#2#3#4#5#6#7 { #6 {#5} {#7} \@@_quick_split_ii:NnnnnNn \@@_quick_split_i:NnnnnNn \@@_quick_split_end:nnnwnw { #6 {#7} } { #3 #2 } {#4} {#5} } \cs_new:Npn \@@_quick_split_ii:NnnnnNn #1#2#3#4#5#6#7 { #6 {#5} {#7} \@@_quick_split_ii:NnnnnNn \@@_quick_split_i:NnnnnNn \@@_quick_split_end:nnnwnw { #6 {#7} } {#3} { #4 #2 } {#5} } % \end{macrocode} % \end{macro} % % \begin{macro}[EXP] % { % \@@_quick_end:nnTFNn, % \@@_quick_single_end:nnnwnw, % \@@_quick_only_i_end:nnnwnw, % \@@_quick_only_ii_end:nnnwnw, % \@@_quick_split_end:nnnwnw, % } % The \cs{@@_quick_end:nnTFNn} appears instead of the user's % conditional, and receives as its arguments the pivot~|#1|, a fake % item~|#2|, a \texttt{true} and a \texttt{false} branches |#3| % and~|#4|, followed by an ending function~|#5| (one of the four % auxiliaries here) and another copy~|#6| of the fake item. All those % are discarded except the function~|#5|. This function receives % lists~|#1| and~|#2| of items less than or greater than the % pivot~|#3|, then a continuation code~|#5| just after \cs{s_@@_mark}. % To avoid a memory problem described earlier, all of the ending % functions read~|#6| until \cs{s_@@_stop} and place~|#6| back into the % input stream. When the lists |#1| and~|#2| are empty, the % \texttt{single} auxiliary simply places the continuation~|#5| before % the pivot~|{#3}|. When |#2|~is empty, |#1|~is sorted and placed % before the pivot~|{#3}|, taking care to feed the continuation~|#5| % as a continuation for the function sorting~|#1|. When |#1|~is % empty, |#2|~is sorted, and the continuation argument is used to % place the continuation~|#5| and the pivot~|{#3}| before the sorted % result. Finally, when both lists are non-empty, items larger than % the pivot are sorted, then items less than the pivot, and the % continuations are done in such a way to place the pivot in between. % \begin{macrocode} \cs_new:Npn \@@_quick_end:nnTFNn #1#2#3#4#5#6 {#5} \cs_new:Npn \@@_quick_single_end:nnnwnw #1#2#3#4 \s_@@_mark #5#6 \s_@@_stop { #5 {#3} #6 \s_@@_stop } \cs_new:Npn \@@_quick_only_i_end:nnnwnw #1#2#3#4 \s_@@_mark #5#6 \s_@@_stop { \@@_quick_split:NnNn #1 \@@_quick_end:nnTFNn { } \s_@@_mark {#5} {#3} #6 \s_@@_stop } \cs_new:Npn \@@_quick_only_ii_end:nnnwnw #1#2#3#4 \s_@@_mark #5#6 \s_@@_stop { \@@_quick_split:NnNn #2 \@@_quick_end:nnTFNn { } \s_@@_mark { #5 {#3} } #6 \s_@@_stop } \cs_new:Npn \@@_quick_split_end:nnnwnw #1#2#3#4 \s_@@_mark #5#6 \s_@@_stop { \@@_quick_split:NnNn #2 \@@_quick_end:nnTFNn { } \s_@@_mark { \@@_quick_split:NnNn #1 \@@_quick_end:nnTFNn { } \s_@@_mark {#5} {#3} } #6 \s_@@_stop } % \end{macrocode} % \end{macro} % % \subsection{Messages} % % \begin{macro}{\@@_error:} % Bailing out of the sorting code is a bit tricky. It may not be safe % to use a delimited argument, so instead we redefine many % \pkg{l3sort} commands to be trivial, with \cs{@@_level:} jumping to % the break point. This error recovery won't work in a group. % \begin{macrocode} \cs_new_protected:Npn \@@_error: { \cs_set_eq:NN \@@_merge_blocks_aux: \prg_do_nothing: \cs_set_eq:NN \@@_merge_blocks: \prg_do_nothing: \cs_set_protected:Npn \@@_level: { \group_end: \prg_break: } } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_disable_toksdef:, \@@_disabled_toksdef:n} % While sorting, \tn{toksdef} is locally disabled to prevent users % from using \tn{newtoks} or similar commands in their comparison % code: the \tn{toks} registers that would be assigned are in use by % \pkg{l3sort}. In format mode, none of this is needed since there is % no \tn{toks} allocator. % \begin{macrocode} \cs_new_protected:Npn \@@_disable_toksdef: { \cs_set_eq:NN \toksdef \@@_disabled_toksdef:n } \cs_new_protected:Npn \@@_disabled_toksdef:n #1 { \msg_error:nne { sort } { toksdef } { \token_to_str:N #1 } \@@_error: \tex_toksdef:D #1 } \msg_new:nnnn { sort } { toksdef } { Allocation~of~\iow_char:N\\toks~registers~impossible~while~sorting. } { The~comparison~code~used~for~sorting~a~list~has~attempted~to~ define~#1~as~a~new~\iow_char:N\\toks~register~using~ \iow_char:N\\newtoks~ or~a~similar~command.~The~list~will~not~be~sorted. } % \end{macrocode} % \end{macro} % % \begin{macro}{\@@_too_long_error:NNw} % When there are too many items in a sequence, this is an error, and % we clean up properly the mapping over items in the list: break using % the type-specific breaking function |#1|. % \begin{macrocode} \cs_new_protected:Npn \@@_too_long_error:NNw #1#2 \fi: { \fi: \msg_error:nneee { sort } { too-large } { \token_to_str:N #2 } { \int_eval:n { \l_@@_true_max_int - \l_@@_min_int } } { \int_eval:n { \l_@@_top_int - \l_@@_min_int } } #1 \@@_error: } \msg_new:nnnn { sort } { too-large } { The~list~#1~is~too~long~to~be~sorted~by~TeX. } { TeX~has~#2~toks~registers~still~available:~ this~only~allows~to~sort~with~up~to~#3~ items.~The~list~will~not~be~sorted. } % \end{macrocode} % \end{macro} % % \begin{macrocode} \msg_new:nnnn { sort } { return-none } { The~comparison~code~did~not~return. } { When~sorting~a~list,~the~code~to~compare~items~#1~and~#2~ did~not~call~ \iow_char:N\\sort_return_same: ~nor~ \iow_char:N\\sort_return_swapped: .~ Exactly~one~of~these~should~be~called. } \msg_new:nnnn { sort } { return-two } { The~comparison~code~returned~multiple~times. } { When~sorting~a~list,~the~code~to~compare~items~#1~and~#2~called~ \iow_char:N\\sort_return_same: ~or~ \iow_char:N\\sort_return_swapped: ~multiple~times.~ Exactly~one~of~these~should~be~called. } \prop_gput:Nnn \g_msg_module_name_prop { sort } { LaTeX } \prop_gput:Nnn \g_msg_module_type_prop { sort } { } % \end{macrocode} % % \begin{macrocode} % % \end{macrocode} % % \end{implementation} % % \PrintIndex