#! /usr/bin/env perl

# Use:
# $ perldoc texdirflatten
# to see embedded documentation. Alternatively, you can create manual
# or HTML pages using pod2man and pod2html.

=pod

=head1 NAME

B<texdirflatten> - Collects all components of a (La)TeX file in a
single output directory -- i.e., flattens its hierarchy.

=head1 SYNOPSIS

texdirflatten [-1|--onetex] [-f|--file input.tex] [-o outputdir] 
    	      [--imgexts .ext1,.ext2,...] [--debug] [-V|--version] [-?|--help]

=head1 DESCRIPTION

This Perl script parses a LaTeX file recursively, scanning all child
files, and collects details of any included and other data files, such
as graphics and BiBTeX bibliography files. These component files, are
then all put into a single directory (thus "flattening" the document's
directory tree). This is useful in distributing manuscripts to
collaborators or in submitting to journals.

=for comment
Run without parameters to see usage.

=head1 OPTIONS

=over

=item B<--file>, B<-f> I<input.tex>

Specifies input (La)TeX file.

=item B<--onetex>, B<-1>

If specified, produces a single TeX file by expanding all \input and
\include commands in place.

=item B<--output>, B<-o> I<outputdir>

Directory to collect all files. B<texdirflatten> will copy each source
file, graphics and bibliography file to this directory. It will be
created if it is unexistent. If unspecified, it defaults to C<flat/>.

=item B<--imgexts> I<.ext1,.ext2[,...]>

Prepends to the prioritized list of image extensions to search when
trying to find the image to copy to output folder. The first file with
the extension found will be copied and search stopped. Default order
is C<"", ".eps", ".pdf", ".pstex"> (note that it includes files with
no extension). If using pdflatex, one may want to add PDF before EPS
with C<--imgexts .pdf>. Don't forget the dot before the extension!

=item B<--debug>

Enables copious amounts of debugging output - useful if something is going wrong.

=item B<--version>, B<-V>

Displays the current version number and the usage and exits.

=item B<--help>, B<-?>

Show this manual page.

=back

=head1 EXAMPLES

The following example scans C<manuscript.tex> in the current directory
and gathers it and all its components in the C<submit_01/> directory:

 $ texdirflatten -f manuscript.tex -o submit_01

=head1 CAVEATS

Please take backups before running this command. No warranties
whatsoever provided.

You may need to run C<epstopdf> on EPS files if you are using C<pdflatex>:

 $ for i in *.eps; do epstopdf $i; done

=head1 BUGS

Bug reports and patches are welcome.

=head1 AUTHOR

Cengiz Gunay <cengique<AT>users.sf.net>

=head1 COPYRIGHT AND LICENSE

Copyleft 2003-2017, Cengiz Gunay

This library is free software; you may redistribute it and/or modify
it under the same terms as Perl itself.

=cut

# TODO:
# - parse BIBINPUTS environment variable and search figures and such
# there as well.
# - also copy bibtex style file

package texdirflatten;

use Getopt::Long qw(GetOptions);
use Pod::Usage;
use strict;
use warnings;
use re 'eval';

# Global
my $outputdir   = "flat";	# Output directory
my $file = "";			# File to read
my @imgexts = ("", ".eps", ".pdf", ".pstex"); # Ordered list of image extensions to search and copy
my $imgexts_pre;			    # Prepend to @imgexts
my $onetex = 0;			# Indicates all input and include commands be expanded
my $version = "1.3";		# 2017-02-02
my $DEBUG = 0;			# Enable debugging output
my $help;

# parse helpers
my %texfiles;

# GetOpt::Long::
my $result = GetOptions ("output|o=s" => \$outputdir,
			 "onetex|1" => \$onetex,
			 "file|f=s" => \$file,
			 "imgexts=s" => \$imgexts_pre,
			 "debug" => \$DEBUG,
			 "version|V" => sub { print "texdirflatten version $version\n"; 
					      pod2usage( -section => "SYNOPSIS"); },
			 "help|?" => \$help );

# regexp for none or an even number of backslashes (\):
my $unescaped =
      '(?<! \\\\)(?(?= \\\\) (?: \\\\\\\\)* )?';

# counter for files translated (like LyX does)
$::filecount = 0;

pod2usage( -verbose => 2 ) if ($help);

pod2usage( -section => "SYNOPSIS") if ($file eq "");

if ($imgexts_pre) {
    unshift(@imgexts, split(/,/, $imgexts_pre));
    print "imgexts: ". join(",", @imgexts) . "\n" if $DEBUG;
}

system "mkdir $outputdir" if (not -d $outputdir);

# start recursing
parseTeX($file, "", popfile($file), 0);

sub parseTeX {
  my $file = shift;
  my $inputdir = shift;
  my $outfile = "$outputdir/" . shift;
  my $level = shift;

  # Add a proper TeX suffix if it's missing
  $outfile .= '.tex' if ! ($outfile =~ /\.tex\s*$/i );

  my @flats;
  my @longs;
  my $popped;

  open my $FILE, $file or die "Cannot find file to read $file\n";

  # Read to whole file first, and then scan for regexps
  my $contents = "";
  while(<$FILE>) {
    $contents = $contents . $_;
  }
  close $FILE;

  ## First, remove comments so that input commands inside are ignored.
  # Also remove trailing newline and all whitespace at the beginning
  # of the following line like TeX does it. Only for %'s following
  # the proper backslash pattern.
  $contents =~ s/( $unescaped )\%.*?\n[ \t]*/$1/gx;

  # Look for graphics include statement
  if ($contents =~ /\\input\@path\{\{([^}]*)\}\}/) {
    $inputdir = $1;
    print "Found input directory: $inputdir\n" if $DEBUG;
  }

  print "Parsing '$file' in directory '$inputdir'\n";

  # Default value
  #$inputdir = './' if (-z $inputdir);

  # three cases: graphics, inputs and bibs

  # an \includegraphics statement
  @::flats = ();
  @::longs = ();
  #$::popped = "";

  $contents =~
    s/\\includegraphics(\[[^\]]*\])?\{([^}]*)\}
      (?{ $::popped = flattenfilename($2);
	push @::longs, $2; push @::flats, $::popped })
     /\\includegraphics$1\{$::popped\}/gmx;

  if ($#::flats > -1) {
    my ($long, $flat);

    foreach $long (@::longs) {
      $flat = shift @::flats;

      # convert LyX directory dots
      $long =~ s/\\lyxdot /./g;
      $flat =~ s/\\lyxdot /./g;

      print "Looking for graphics: '$long'\n" if $DEBUG;

      my @dirs = ("", "$inputdir", $inputdir . '../figures/');

      seekfile($long, $flat, \@imgexts, 
	       ["", "$inputdir", $inputdir . '../figures/']);

    }
  }

  # Must do bibs before input commands because, when expanded, they
  # add new bibs inside.
  replacebibs(\$contents, $inputdir);

  # an \input or \include statement
  @flats = ();
  @longs = ();
  $contents =~
    s/\\(input|include)\{([^}]*)\}/"\\${1}\{" .
      ($popped = flattenfilename($2) and push @longs, $2 and push @flats,$popped and $popped)
	. "\}"/egm;

  if ($#flats > -1) {
    #print "Found " . scalar @longs . " items on line: '@flats'.\n";

    my ($long, $flat);

    foreach $long (@longs) {
      $flat = shift @flats;
      print "Found Input/Include: '$long'\n" if $DEBUG;
      # recurse to parse that tex file unless it's done already

      if (not exists $texfiles{$flat}) {
	if (! -r $long ) {
	  if (-r $inputdir . $long) {
	    $long = $inputdir . $long;
	  } elsif (-r "$long.tex") {
	    $long .= '.tex';
	  }
	}
	$texfiles{$flat} = $long;
	my $newcontents = parseTeX($long, basedir($long), $flat, $level + 1); #later
	if ( $onetex ) {
	  # If specified, expand inclusion directives
	  die "Failed to find input/include to expand for '$flat'!\n"
	    if (! ( $contents =~
		s/\\(input|include) \{ \Q$flat\E \} 
                  (?{ print "===Found \\$1\{$flat\} to expand.\n" if $DEBUG; })
		  / ($1 eq 'include' ? "\\clearpage\n" : '') . "$newcontents"/gxem ) );
	}
      }
    }
  }

  #print "Writing to \"$outfile\"\n" .
  #  "----------------------------------------\n" .
  #  "$contents\n" .
  #  "----------------------------------------\n";

  if (! $onetex || $level == 0) {  # do output at zero recurse level
    # open flat output file
    open my $COPY, ">$outfile" or die "Cannot write file $outfile\n";

    # write to flat copy
    print $COPY $contents;

    close $COPY;
  } else {
    return $contents;
  }
}

# Look for file to copy
sub seekfile {
    my ($file, $dest, $exts, $dirs) = @_;
    my @exts = @$exts;
    my @dirs = @$dirs;
    
  OUT: foreach my $dir (@dirs) {
      foreach my $ext (@exts) {
	  my $tryfile = "$dir$file$ext";
	  if (-r $tryfile) {
	      # add the extension to $dest only if it hasn't have any dots
	      #(! ($dest =~ /\./)) ? $ext : ""; => better not to have this
	      my $dext = $ext; 
	      
	      if (system("cp $tryfile $outputdir/$dest$dext") != 0) {
		  die "Cannot copy $tryfile!\n";
	      }
	      # If we found it, then get out of the loop
	      print "Copied: '$tryfile' (ext: $ext) -> '$dest$dext'\n" if $DEBUG;
	      last OUT;
	  }
      }
  }
}

# Return everything after the last /
sub popfile {
  my $file = shift (@_);

  my @a = split /\//, $file;

  #print "Popping $file... split: @a ($#a), pop: " . pop(@a) . "\n";
  pop @a;
}

# Make a new file by flattening directory names
sub flattenfilename {
  local $_ = shift;

  # convert slashes to pluses, dots to Xs
  tr|/|+|;
  tr|\.|X|;

  # add a unique number at the beginning so files don't start with a
  # dot
  $_ = sprintf("%03d_", ++$::filecount) . "$_";

  print "===Flattened file to '$_'\n" if $DEBUG;

  return $_;
}

sub basedir {
  my $file = shift (@_);

  my @a = split /\//, $file;

  #print "Popping $file... split: @a ($#a), pop: " . pop(@a) . "\n";
  pop @a;			# remove file part
  my $basedir = join('/', @a);		# combine the rest with /'s
  $basedir .= '/' if (! $basedir =~ /^$/);		# add a trailing / if not empty
  $basedir;
}

# get the bibliography files
sub replacebibs {
  my ($contentsref, $inputdir) = @_;
  #my $contents = $$contentsref;
  my $popped;

  #print "bib contents: $contents\n";

  local @::flats = ();
  local @::longs = ();
  local @::refs;

  $" = ',';			# set list item separator
  # search for \bibliography statements
  $$contentsref =~
    s/\\bibliography(\[[^\]]*\])? {
      (?{ @::refs = (); #print "Found bibliography!\n"; #initialize ref list
        }) (?: \s* ([^}, ]+)(?=[,}]),?
      (?{ # for each word do:
	  #local (@_flats, @_longs, @_refs);
          #print "Word: \"$+\", "; 
          $popped = flattenfilename($+); push @::longs, $+;
          push @::flats,$popped; push @::refs, $popped;
        }) )+ \s* }
      (?{ #print "\n+++ $#::flats; $#::refs; $#::longs\n";
          #push @::flats, @_flats; push @longs, @_longs; push @refs, @_refs;
          #print "Refs: @::refs\n";
    })/"\\bibliography" . ($1 || "") . "{@::refs}"/egmx;# &&
      #print "bib contents: $$contentsref\n";
      #print "\n*** $#::flats; $#::refs; $#::longs\n";
  $" = ' ';			# restore list item separator

  if ($#::flats > -1) {
    #print "Found " . scalar @::longs . " items on line: '@::flats'.\n";

    my ($long, $flat);

    foreach $long (@::longs) {
      $flat = shift @::flats;
      print "Found bib. file: '$long'\n" if $DEBUG;
      # recurse to parse that tex file unless it's done already

      # convert LyX directory dots
      $long =~ s/\\lyxdot /./g;

      seekfile($long, "$flat", [".bib"], 
	       ["", "$inputdir"]);
    }
  }
}