Reducir el tamaño de múltiples PDF desde un script de bash

A veces las editoriales suben PDFs en alta resolución, y es necesario reducir su tamaño para poder usarlos. Acá les dejo un script que me ayudó a automatizar esta tarea.

Bueno, este es mi código:

#!/bin/bash
# Mirror every directory listed in the paths file into a sibling "<dir>_low"
# directory and compress the PDFs there that are new since the last run.
#
# For each listed directory DIR:
#   1. rsync DIR/ into DIR_low/ without overwriting files already present
#      (those may already have been compressed in place);
#   2. run the compressor on every PDF in DIR_low/ whose status changed
#      after DIR_low/.marca was last touched (or on every PDF when no
#      marker exists yet);
#   3. touch DIR_low/.marca.
lock=/tmp/hmhbaja.lock
paths_file=/etc/hmhbaja/paths
compressor=/root/pdf2low.pl

# Make sure no other instance is running.  With noclobber the redirection
# fails if the lock already exists, so test-and-create is a single step.
if ( set -o noclobber; : > "$lock" ) 2>/dev/null; then
    # Release the lock however the script ends.
    trap 'rm -f "$lock"' EXIT

    # Read one path per line on fd 3 so that paths containing spaces work
    # and the commands inside the loop cannot consume the list via stdin.
    while IFS= read -r dir <&3 || [ -n "$dir" ]; do
        [ -n "$dir" ] || continue        # skip blank lines
        src="$dir/"
        dst="${dir}_low/"
        marker="${dst}.marca"

        rsync -r --ignore-existing "$src" "$dst"

        if [ -f "$marker" ]; then
            find "$dst" -iname '*.pdf' -cnewer "$marker" -execdir "$compressor" {} \;
        else
            # First run: there is no reference file for -cnewer, so
            # process every PDF.
            find "$dst" -iname '*.pdf' -execdir "$compressor" {} \;
        fi
        touch "$marker"
    done 3< "$paths_file"
fi

Tiene un archivo de configuración con los paths:

/var/ftp/virtual1/hmhreadingse/Grado2/TE_Reprints/07-Digitals
/var/ftp/virtual1/hmhreadingse/Grado3/TE_Reprints/07-Digitals

Y el archivo que hace la magia, o sea, el que reduce el tamaño del PDF. El script es obra de Wolfgang Dobler; para que funcione necesita, obviamente, pdftk, además de pdftops, Ghostscript (gs) y pdfopt, que son los programas que invoca.

#!/bin/sh
#  -*-Perl-*-
# ====================================================================== #
# Shell bootstrap: pick a perl interpreter and re-execute this same file
# with it.  Preference order is /usr/local/bin/perl, then /usr/bin/perl,
# then whatever `which perl` reports (the sed removes an "aliased to"
# prefix from that report).
if [ -x /usr/local/bin/perl ]; then
  perl=/usr/local/bin/perl
elif [ -x /usr/bin/perl ]; then
  perl=/usr/bin/perl
else
  perl=`which perl| sed 's/.*aliased to *//'`
fi

# -x: perl starts reading at the interpreter line below, skipping this
# shell section.  The interpreter and script paths are quoted so that
# locations containing whitespace survive word splitting.
exec "$perl" -x -S "$0" "$@"
# ====================================================================== #
#! /Good_Path/perl -w
# line 17

# Name:   compress-newsletter
# Author: wd (Wolfgang.Dobler@ucalgary.ca)
# Date:   03-Oct-2005
# Description:
#   Use ghostscript's pdfwrite device (à la ps2pdf) to reduce the
#   Newsletter's PDF file size, and add meta information like author,
#   date, etc.
#   The preferred route is currently:
#                 [scribus>=1.2.3]
#                        |
#                    file.pdf
#                        |
#                 [pdftops>=3.00]
#                        |
#                     file.ps
#                        |
#            [pstopdf14 (gs-gnu-8.16 or higher)]
#                        |
#                        V
#                    final.pdf
# Usage:
#   compress-newletter [-i col:gray:mono] Newsletter_big.pdf
# Options:
#   -i col:gray:mono
#   --imgres=col:gray:mono   Set resolution for downsampling color,
#                            grayscale and black-and-white images
#                            (default is 144:300:300)
#   --debug                  Be verbose and keep temporary files around
use strict;
use File::Temp qw/ :mktemp /;

use Getopt::Long;
# Allow for `-Plp' as equivalent to `-P lp' etc:
Getopt::Long::config("bundling");

# ---------------------------------------------------------------------- #
# Main program: compress one PDF (or PostScript) file in place.
#
# Steps: read metadata with pdftk, convert to PostScript with pdftops,
# re-distill with Ghostscript's pdfwrite device (downsampling images and
# attaching pdfmark metadata), run pdfopt, and finally move the result
# over the input file.  Every external tool is checked, and the input is
# replaced only after a non-empty result exists, so a failing tool leaves
# the original file untouched.
# ---------------------------------------------------------------------- #

my %opts;                       # Options hash for GetOptions

# Declared up front so that run_step() and the END block can see them no
# matter where the script stops.  The file names stay undefined until the
# corresponding file may actually exist.
my ($debug, $quiet);
my ($psfile, $tmpfile, $metafile, $outfile);

## Process command line
GetOptions(\%opts,
	   qw( -h   --help
	       -i=s --imgres=s
	            --debug
	       -q   --quiet
               -v   --version ));

$debug = ($opts{'debug'} ? 1 : 0 ); # undocumented debug option
if ($debug) {
    printopts(\%opts);
    print "\@ARGV = `@ARGV'\n";
}

if ($opts{'h'} || $opts{'help'})    { die usage();   }
if ($opts{'v'} || $opts{'version'}) { die version(); }

$quiet     = ($opts{'q'} || $opts{'quiet'}  || ''           );
my $imgres = ($opts{'i'} || $opts{'imgres'} || '144:300:300');

# Validate the requested image resolutions before any tool is started
my ($colres,$grayres,$monores) = ($imgres =~ /([0-9]+):([0-9]+):([0-9]+)/);
die "Image resolution must be of form `col:gray:mono'\n"
    unless defined($monores);

my ($gs,      @gsargs     ) = ('gs'     );
my ($pdftops, @pdftopsargs) = ('pdftops');
my ($pdfopt,  @pdfoptargs ) = ('pdfopt' );

my $infile = shift or die usage();
die "Input file `$infile' not found\n" unless -f $infile;

# The extension is matched case-insensitively because callers may hand
# over names such as FILE.PDF.  If no output name can be derived, stop:
# otherwise the result would be written over the input and then removed.
(my $root = $infile) =~ s/\.(pdf|ps).*//i;
(my $newname = $infile) =~ s/(.*)(\.(pdf|ps))/${1}_new${2}/i;
die "Cannot derive an output name from `$infile' (expected .pdf or .ps)\n"
    if $newname eq $infile;
$tmpfile = mktemp("${root}.tmp_XXXXXX");

## 0. Extract all sorts of information

# Extract Scribus version, creation date, bookmarks from original PDF.
# pdftk is started without a shell, so odd characters in the file name
# are harmless.  A pdftk failure is not fatal: defaults are used instead.
print "Running pdftk ...\n" unless $quiet;
print STDERR "pdftk $infile dump_data output\n" if ($debug);
my $meta = '';
if (open(my $pdftk, '-|', 'pdftk', $infile, 'dump_data', 'output', '-')) {
    local $/;                   # slurp the whole report
    $meta = <$pdftk>;
    $meta = '' unless defined($meta);
    close($pdftk)
	or warn "pdftk failed on `$infile'; using default metadata\n";
} else {
    warn "Cannot run pdftk: $!; using default metadata\n";
}
my ($creator) = ( $meta =~
		  m{InfoKey: Creator\s+InfoValue:\s*(.+)$}m
		);
$creator = 'Scribus 1.2.3' unless defined($creator);
my $datestring = extract_CreationDate($meta);
my @bookmarks = extract_bookmarks($meta); # currently unused, see below

## 1. Run pdftops
push @pdftopsargs, "-level3";
$psfile = mktemp("${root}.ps_XXXXXX");
push @pdftopsargs, $infile, $psfile;
run_step('pdftops', $pdftops, @pdftopsargs);

## 2. Run gs
# a) Prepare options
push @gsargs, qw{-q -dNOPAUSE -dBATCH};
push @gsargs, '-sDEVICE=pdfwrite';
push @gsargs, '-dCompatibilityLevel=1.3';
# One of /printer, /screen, /prepress, /ebook, /default; see Ps2pdf.htm:
push @gsargs, '-dPDFSETTINGS=/screen';
push @gsargs, '-dEmbedAllFonts=true';
push @gsargs, '-dSubsetFonts=true';
push @gsargs, '-dColorImageDownsampleType=/Bicubic';
push @gsargs, "-dColorImageResolution=$colres";
push @gsargs, '-dGrayImageDownsampleType=/Bicubic';
push @gsargs, "-dGrayImageResolution=$grayres";
push @gsargs, '-dMonoImageDownsampleType=/Bicubic';
push @gsargs, "-dMonoImageResolution=$monores";
push @gsargs, "-sOutputFile=$tmpfile";
# NOTE(review): this reaches gs as ONE argument ("-c .setpdfwrite"), not
# as "-c" followed by ".setpdfwrite".  Left unchanged because the effect
# of either form depends on the Ghostscript version -- confirm.
push @gsargs, "-c .setpdfwrite";

# b) Write meta information to temporary file
# Backslashes and parentheses are escaped so that the creator name cannot
# terminate the PostScript string it is embedded in.
(my $ps_creator = $creator) =~ s/([\\()])/\\$1/g;
$metafile = "${root}.meta";
open(my $meta_fh, '>', $metafile)
    or die "Cannot write $metafile: $!\n";
print {$meta_fh} <<"DEAD_PARROT";
% Document information
[%
 /CreationDate (D:$datestring)
 /ModDate (D:$datestring)
 /Creator ($ps_creator)
 /Title ([Insert your document title here])
 /Subject ([Insert the Subject here])
 /Keywords ([Insert key words here])
 /Author ([Insert author' nsme here])
 /DOCINFO pdfmark

% Initial view on opening the document
[/View [/Fit] % Fit page in window
 /Page 1
 % /PageMode /UseOutlines % /UseNone /UserOutlines /UseThumbs /FullScreen
 /DOCVIEW pdfmark

DEAD_PARROT
# Close before gs reads the file, and notice a short write
close($meta_fh) or die "Cannot write $metafile: $!\n";

## Bookmarks. [Commented out for acroread 7.0 has problems] Currently at
## the mercy of the original bookmarks (and Scribus 1.2.2 does not allow
## to edit the bookmark names) and the encoding that pdftk understands
## (most quotation marks get mapped to `?').
## Ideally, one would write out the meta information file with
## `compress-newsletter -m CC.pdf' and use it then with
## `compress-newsletter CC.pdf'.
## % Bookmarks: @bookmarks

push @gsargs, '-f', $psfile, $metafile;
run_step('gs', $gs, @gsargs);

## 3. Run pdfopt
$outfile = $newname;            # from here on END may have to remove it
run_step('pdfopt', $pdfopt, @pdfoptargs, $tmpfile, $outfile);

## 4. Replace the input file with the compressed result
# rename() swaps the file in one step; the input is never deleted first.
die "$pdfopt produced no output; `$infile' left untouched\n"
    unless -s $outfile;
rename($outfile, $infile)
    or die "Cannot replace `$infile' with `$outfile': $!\n";

END {
    # Clean up temporary files, also in case of an error.  With --debug
    # they are kept around for inspection.
    unless ($debug) {
        foreach my $file ($psfile, $tmpfile, $metafile, $outfile) {
	    unlink $file if (defined($file) && -f $file);
        }
    }
}

# ---------------------------------------------------------------------- #
sub run_step {
# Announce and run one external tool without a shell; die if it cannot be
# started or reports failure.
#   $label -- name shown in progress and error messages
#   @cmd   -- program followed by its arguments
    my ($label, @cmd) = @_;

    print "Running $label ...\n" unless $quiet;
    print STDERR "@cmd\n" if ($debug);

    my $status = system(@cmd);
    die "Cannot run $label: $!\n" if $status == -1;
    die "$label failed (wait status $status)\n" if $status != 0;
    return 1;
}

# ---------------------------------------------------------------------- #
sub extract_CreationDate {
# Build the date part of a PDF date string (without the leading `D:')
# from a pdftk `dump_data' report.
#   $meta   -- text of the report (may be empty or undefined)
# Returns  YYYYMMDDHHMMSS followed by a time zone offset written as
#          +HH'MM' or -HH'MM'.
# Only the 14-digit time stamp of the CreationDate value is used, so a
# value that already starts with `D:' or carries an offset is neither
# prefixed nor suffixed twice.  An offset found in the report is kept; a
# bare time stamp gets the historical default -06'00'.  Without a usable
# CreationDate the current local time and offset are returned.

    use POSIX qw(strftime);

    my $meta = shift;
    $meta = '' unless defined($meta);

    my ($cdate) = ( $meta =~
		    m{InfoKey: CreationDate\s+InfoValue:\s*(.+)$}m
		  );
    # Time string: need to splice in "'" after hours and minutes of time zone
    # definition. To me this looks like the technical documentation was taken
    # too literally and now applications (and Acroread 7) insist on these
    # stupid markers.
    if (defined($cdate)
	&& $cdate =~ /([0-9]{14})(?:(Z)|([+-][0-9]{2})'?([0-9]{2})'?)?/) {
	my ($stamp, $utc, $tzh, $tzm) = ($1, $2, $3, $4);
	return "$stamp+00'00'"   if defined($utc);
	return "$stamp$tzh'$tzm'" if defined($tzh);
	return "$stamp-06'00'";
    }

    # Creation date unknown -- use current date
    my $tz = strftime "%z", localtime();
    $tz =~ s/([0-9][0-9])([0-9][0-9])/$1'$2'/;
    return strftime "%Y%m%d%H%M%S$tz", localtime();
}
# ---------------------------------------------------------------------- #
sub extract_bookmarks {
# Turn the bookmark entries of a pdftk `dump_data' report into pdfmark
# lines.
#   $meta   -- text of the report (may be empty or undefined)
# Returns  a list with one "[/Title (...) /Page N /OUT pdfmark\n" string
#          per BookmarkTitle/BookmarkLevel/BookmarkPageNumber triple, in
#          report order; the empty list if there are none.
# The bookmark level is matched but not used.

    my $meta = shift;

    my @bm;
    return @bm unless defined($meta);

    while ($meta =~ /^BookmarkTitle:      \s* (.*) \n
                      BookmarkLevel:      \s* (.*) \n
                      BookmarkPageNumber: \s* (.*) /xmg) {
	my ($title,$page) = ($1,$3);
	# Escape characters that would end the PostScript string early
	$title =~ s/([\\()])/\\$1/g;
	push @bm, "[/Title ($title) /Page $page /OUT pdfmark\n";
    }

    return @bm;
}
# ---------------------------------------------------------------------- #
sub printopts {
# Debugging aid: write every entry of the options hash to STDERR, one
# line per option, formatted as  $opts{name} = `value'
#   $optsref -- reference to the options hash filled by GetOptions
    my ($optsref) = @_;
    my %opts = %$optsref;
    my @names = keys(%opts);
    print STDERR map { "\$opts{$_} = `$opts{$_}'\n" } @names;
}
# ---------------------------------------------------------------------- #
sub usage {
# Build the help text from the header comment of this very file.
# The file is read paragraph by paragraph; the first paragraph holding a
# description or usage heading is taken, everything in front of that
# heading and every non-comment line is dropped, and the comment signs
# are removed.
# Returns  the help text, or "\n" if no such paragraph exists.
# Dies     if this file cannot be opened.
    my $thisfile = __FILE__;
    local $/ = '';              # Read paragraphs

    open(my $fh, '<', $thisfile) or die "Cannot open $thisfile\n";
    my $text;
    while (my $para = <$fh>) {
	# Paragraph _must_ contain `Description:' or `Usage:'
        next unless $para =~ /^\s*\#\s*(Description|Usage):/m;
        # Drop `Author:', etc. (anything before `Description:' or `Usage:')
        $para =~ s/.*?\n(\s*\#\s*(Description|Usage):\s*\n.*)/$1/s;
        # Code that directly follows the header is not part of the help
        $para = join('', grep { /^\s*\#/ } split(/^/m, $para));
        # Don't print comment sign:
        $para =~ s/^\s*# ?//mg;
        $text = $para;
        last;                        # ignore body
    }
    close($fh);

    return (defined($text) && length($text)) ? $text : "\n";
}
# ---------------------------------------------------------------------- #
sub version {
# Compose the one-line version banner "<command> version <rev> (<date>)\n"
# from the CVS keyword strings embedded below.
    my $doll='\$';		# Need this to trick CVS
    my %keyword = (
	Revision => '$Revision: 1.8 $',
	Date     => '$Date: 2006/02/02 09:38:52 $',
    );
    # Reduce each keyword string to the first word of its value
    foreach my $name (keys(%keyword)) {
	$keyword{$name} =~ s/${doll}${name}:\s*(\S+).*/$1/;
    }
    # Command name: last component of the path we were started as
    my @parts = split('/', $0);
    my $cmdname = $parts[-1];
    return "$cmdname version $keyword{Revision} ($keyword{Date})\n";
}
# ---------------------------------------------------------------------- #

# End of file compress-newsletter

Espero que les sirva ;)

Related posts:

  1. Indexar pdfs y words a myqsl
  2. Como tener multiples instancias de mysql en el mismo servidor
  3. Perl script para calcular el page rank
  4. Migrar correos desde dattatec (u otro hosting)
  5. Script para importar varios shapes a postgis

Tags: , , ,

Leave a Reply