% Newer
% Older
% Master_thesis / thesis / wordcount.tex
\ProvidesFile{wordcount.tex}[2000/09/27 v1.5 Michael Downes]
% Copyright 2000 Michael John Downes
% This file has no restrictions on its use, distribution, or sale.
%
% If you run LaTeX on wordcount.tex it will prompt you for the name of a
% document to be counted. For most people, however, it will be more
% convenient to run the shell script wordcount.sh, giving the document
% name as the first argument. The comments in wordcount.sh
% give further information about the usage and limitations of this tool.

% The fundamental idea is to mark each character and interword space
% with a unique tag that will show up in TeX "showbox" output. Then
% arrange to make the output routine trigger a TeX overfull vbox message
% for the page box so that everything gets reported in the TeX log.
% Then run grep -c (or an equivalent text search utility, e.g., perl) on
% the log file to count the occurrences.
%
% In showbox output, a character is typically represented by a line of
% the form
%
%   ...\T1/cmr/m/n/10 e
%
% where "\T1/cmr/m/n/10" is the font name. We arrange things so that all
% characters are typeset in a font named "\3.08632". Furthermore, for
% interword spaces, which are normally represented in the output as
% something like
%
%   ...\glue 3.33333 plus 1.66666 minus 1.11111
%
% we use \spaceskip to ensure that it always appears as
%
%   ...\glue(\spaceskip) 3.08633 plus 9.0
%
% Then to count characters (including word spaces) we can
% grep for lines matching "3.0863", while to count words we
% count the number of lines in the log that match "3.08633" but not
% "3.08632".
%
% For an accurate count in multi-line paragraphs we also need
% to set rightskip to the flag value, since interword spaces are
% discarded at line breaks; and on the presumption that hyphenated
% compounds like "steady-state" should be counted as two words, we set
% \exhyphenpenalty to a value that ensures a line break at *every*
% explicit hyphen---then we will get a rightskip glue node between the
% two parts. (How about em-dashes? you may ask. Answer: it works all
% right, try it and see.)

% Quiet-mode flag. If \? currently has the same meaning as \relax, \wcQUIET
% expands to "TT", otherwise to "TF". The flag is consumed further down as
% \if\wcQUIET, which compares those two characters: "TT" is true (quiet),
% "TF" is false (verbose).
% NOTE(review): presumably the caller (e.g. wordcount.sh) does
% \let\?\relax before inputting this file to request quiet mode -- confirm.
\ifx\relax\?%
  \def\wcQUIET{TT}%
\else
  \def\wcQUIET{TF}%
\fi

% We want accented letters to be represented by a single font char, to
% the extent possible:
\RequirePackage[T1]{fontenc}
% From here on @ is a letter, so the private \@@... names below can be used.
\makeatletter
% Set the thick math space. \FreezeSpaces later measures it (through the
% saved name \@@thickmuskip) to obtain the flag width for interword glue.
\thickmuskip=5.55555mu plus5.55555mu minus 2.22222mu
% Keep the current meaning of \thickmuskip under a private name, then make
% the public name refer to a freshly allocated muskip register. Later
% assignments to \thickmuskip therefore no longer reach the saved one.
\let\@@thickmuskip\thickmuskip \newmuskip\thickmuskip
% Save the current meaning of \spaceskip as \@@spaceskip and zero
% \xspaceskip while its name still has its original meaning.
\let\@@spaceskip\spaceskip \xspaceskip=0pt
% Skip register whose value is set in \FreezeSpaces and read by \FreezeFont
% (via \strip@pt) to build the name of the counting font.
\newskip\@charfontname
% Ensure that these cannot be overridden.
% After this line the public names \spaceskip and \xspaceskip both refer to
% the \@charfontname register, so assignments to them land in that register
% instead of in the saved \@@spaceskip.
\let\spaceskip\@charfontname \let\xspaceskip\spaceskip
% \FreezeSpaces -- executed from the \AtBeginDocument hook (no arguments).
% Fixes the interword glue, \rightskip and all space-factor codes so that
% every word space appears in the log with the same recognizable value,
% and computes the value that \FreezeFont uses as the font name.
\def\FreezeSpaces{%
  % Typeset a kern of \@@thickmuskip in math mode; the width of box 0 is
  % then the flag dimension.
  \setbox\z@\hbox{\mathsurround=\z@ $\mkern\@@thickmuskip$}%
  % Interword glue (saved \spaceskip): flag width with 9pt of stretch.
  \@@spaceskip=\wd\z@ plus 9pt\relax
  % Setting rightskip to a slightly different value means that
  % end-of-line spaces can be distinguished if desired. (By default,
  % they are not.) Using fil units means there won't be any hyphenation
  % except for explicit hyphens.
  % \dimen@ = flag width + 1sp; its numeric value becomes the amount of
  % fil stretch in \rightskip.
  \dimen@\@@spaceskip \advance\dimen@ 1sp\relax
  \rightskip\z@ plus\strip@pt\dimen@ fil\relax
  % Prevent any further changes to \rightskip.
  % (\spaceskip is at this point the name of a dummy register, so the
  % name \rightskip now refers to that register as well.)
  \let\rightskip\spaceskip
  % Flag width minus 1sp. \FreezeFont turns this value into the name of
  % the font, which gives characters a tag that differs from the glue tag.
  \@charfontname=\wd\z@ \advance\@charfontname -1sp\relax
  % Cancel all non-1000 sfcodes, which might otherwise affect the value
  % of inter-sentence spaces and keep them from being counted as
  % interword spaces.
  % The loop uses \fam as its counter: set \sfcode of the current code to
  % \@m, step the counter, and once it exceeds \@cclv gobble the
  % recursive \do so that the loop stops.
  \def\do{\sfcode\fam\@m \advance\fam\@ne
    \ifnum\fam>\@cclv\expandafter\@gobble\fi \do
  }%
  % Start the loop at code 0.
  \fam\z@ \do
  % Turn \frenchspacing and \nonfrenchspacing into no-ops.
  \let\frenchspacing\relax \let\nonfrenchspacing\relax
}
% \DashPatch -- executed from the \AtBeginDocument hook (no arguments).
% Forces a line break after every explicit hyphen, forbids breaks at
% discretionary hyphens, and then redirects both penalty names to the
% scratch register \count@ so that later assignments through those names
% no longer reach the real parameters.
\def\DashPatch{%
  \exhyphenpenalty=-\@M
  \let\exhyphenpenalty=\count@
  \hyphenpenalty=\@M
  \let\hyphenpenalty=\count@
  % LaTeX bug? The default definitions of the dash commands don't have
  % \exhyphenpenalty built in, so reduce them to a plain hyphen; the
  % penalties set above then apply to them too.
  \def\textemdash{-}%
  \def\textendash{-}%
}
% Here we set the font to ptmr8t (a T1 encoded font for which most
% people are likely to have a tfm file) and then make it difficult for
% the font to be changed ever again. We use two names for the font,
% \ptmr and \3.08632, the former being easier to use when resetting math
% fonts. The name that is defined last when TeX processes a document is
% the one that will be used in the log file.
%
% \FreezeFont -- executed from the \AtBeginDocument hook, after
% \FreezeSpaces has set \@charfontname (no arguments).
\def\FreezeFont{%
  % Declare ptmr8t once more, this time under a control sequence whose
  % name is the numeric value of \@charfontname (built with \csname).
  \expandafter\font
    \csname\strip@pt\@charfontname\endcsname
    =ptmr8t \relax
  % Select the font for text.
  \ptmr
  % Run the math font check one last time, then disable it.
  \check@mathfonts \let\check@mathfonts\relax
  % Loop with \fam as the counter: assign \ptmr to the text, script and
  % scriptscript font of the current family, step the counter, and gobble
  % the recursive \do as soon as the counter is no longer below \sixt@@n.
  \def\do{%
    \textfont\fam=\ptmr\scriptfont\fam=\ptmr\scriptscriptfont\fam=\ptmr
    \advance\fam\@ne \ifnum\fam<\sixt@@n \else\expandafter\@gobble\fi
    \do
  }%
  % Start the loop at family 0.
  \fam=0 \do
}
% Load the counting font under the name \ptmr. The \fontdimen assignment
% has to follow the \font declaration directly.
\font\ptmr=ptmr8t \relax
\fontdimen22\ptmr=1sp % to permit use as math font 2
% Switch off the font selection interface: \selectfont does nothing and
% the attribute-setting commands just swallow their argument(s).
\let\selectfont=\relax
\let\fontencoding=\@gobble
\let\fontfamily=\@gobble
\let\fontseries=\@gobble
\let\fontshape=\@gobble
\let\fontsize=\@gobbletwo
\let\linespread=\@gobble
% Font loading internals: no font shape is ever loaded, and every newly
% requested font name is simply made an alias of \ptmr.
\let\try@load@fontshape=\relax
\@namedef{U/msa/m/n}{}% disable special test in AMS documentclasses
\def\define@newfont{\expandafter\let\font@name=\ptmr}
% Save the current meanings of \hfuzz and \vfuzz under private names, then
% make both public names refer to one freshly allocated dimen register, so
% that assignments through the public names no longer reach the saved ones.
\let\@@hfuzz=\hfuzz
\let\@@vfuzz=\vfuzz
\newdimen\hfuzz
\let\vfuzz=\hfuzz
% \QuietBoxes -- executed from the \AtBeginDocument hook (no arguments).
% Raises both saved fuzz parameters to \maxdimen so that overfull boxes in
% the document are not reported.
\def\QuietBoxes{%
  \@@hfuzz=\maxdimen
  \@@vfuzz=\maxdimen
}
% Apply all freezes at \begin{document}. \FreezeSpaces must come before
% \FreezeFont, because the latter reads \@charfontname, which the former
% sets.
\AtBeginDocument{%
  \FreezeSpaces
  \FreezeFont
  \DashPatch
  \QuietBoxes
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Halign patches
% \ialign: initialized \halign. Clears \everycr and sets \tabskip to the
% flag glue held in \@@spaceskip before starting the alignment.
\def\ialign{%
  \everycr{}%
  \tabskip=\@@spaceskip
  \halign
}
% \extracolsep{<glue>}: the requested glue (#1) is ignored; \tabskip is set
% to the flag glue instead.
\def\extracolsep#1{\tabskip=\@@spaceskip}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Do not write auxiliary files. Emptying \makeindex is a work-around for a
% natbib problem.
\nofiles
\let\makeindex=\@empty
% Seize control of the output routine so that no dvi clutter is produced;
% each finished page is dumped into the log file instead. Handling
% \AtBeginDVI would take some extra work and is not attempted.
%
% \showpagebox (no arguments) is installed as \@outputpage at
% \begin{document}.
\def\showpagebox{%
  % Zero tolerance, so that the zero-height \vbox below counts as overfull.
  \@@vfuzz=\z@
  % Lift the limits on how much box content is displayed.
  \showboxbreadth=\maxdimen
  \showboxdepth=99
  % This is a devious way of getting the box contents into the log file
  % without actually using \showbox (which typically results in a
  % nonzero exit code from the TeX run): squeeze the page into a \vbox of
  % height zero and let TeX report it.
  \setbox\z@\vbox to\z@{\box\@outputbox}%
  % Empty box 0 again.
  \setbox\z@\box\voidb@x
  % Reset the column height, advance the page counter and update
  % \firstmark.
  % NOTE(review): presumably this mirrors the bookkeeping of the regular
  % \@outputpage -- confirm against the kernel.
  \global\@colht=\textheight
  \stepcounter{page}%
  \let\firstmark=\botmark
}
\AtBeginDocument{%
  \def\@outputpage{\showpagebox}%
}
% Introductory banner; skipped in quiet mode (\wcQUIET is "TT", so the \if
% test is true and only the empty true-branch is taken).
% \msg is a delimited macro: it grabs everything from the line end after
% "\msg" up to the first "]" that is directly followed by a line end,
% closes the group opened by \begingroup, and hands the text to \message.
% While the text is being read, every line ends with the character ^^J
% (\endlinechar) and spaces have category code 12.
% Do not add comments or change the spacing inside the text below: its
% line structure is part of what is printed.
\if\wcQUIET\else
\begingroup \def\msg^^J#1]^^J{\endgroup\message{#1]^^J^^J}}
\endlinechar=`\^^J \catcode`\ =12 \msg

The 'wordcount' utility provides a way of estimating the number of
words in a LaTeX document.

[Warning: The counting method uses a large log file as an intermediate
step. If your disk space is extremely low the counting process
may end prematurely for lack of space.]

\fi

% \typein has extraneous blank lines and slightly idiosyncratic prompt.
% So the prompt is printed with \message and the reply is read directly
% from the terminal into \wcFileName.
% Inside the group \endlinechar is -1, so no end-of-line character is
% appended to the lines read from here on (including the reply).
\begingroup \endlinechar=-1
\message{File name [press RETURN to cancel]: }
% \do is a one-shot helper; it also closes the group opened above.
\def\do{%
  % Apply \@makeother to every character listed in \dospecials, so that
  % the reply is read as plain characters.
  \let\do\@makeother \dospecials
  % A negative stream number makes \read take its input from the terminal
  % without printing a prompt of its own; \global lets the definition of
  % \wcFileName survive the \endgroup.
  \global\read-1 to\wcFileName
  \endgroup
}
\do

% An empty reply (plain RETURN) cancels the run.
\ifx\wcFileName\@empty
  \typeout{No file name given; quitting.}%
  \expandafter\@@end
\fi

% Split the name into its parts and redefine \jobname (as a macro) to the
% base name of the document being counted.
\expandafter\filename@parse\expandafter{\wcFileName}
\edef\jobname{\filename@base}

% Quiet mode: just echo the file name. Otherwise print the grep command
% that counts the tagged lines in the log file.
% The text inside \typeout is printed as is (^^J starts a new line); the
% lone % lines and the % at line ends only suppress line ends in the
% source, so the spacing inside the argument must not be changed.
\if\wcQUIET
  \message{\wcFileName}
\else
\typeout{^^JTo get a count of characters, run^^J^^J \space
%
  grep -c '3[.]0863[23]' wordcount.log^^J^^J%
%
or equivalent when processing is finished.^^J%
}
\fi

% Progress message, printed in both modes.
\typeout{Processing \wcFileName...^^J}

% Choose the interaction mode: \batchmode when the file can be found,
% \nonstopmode otherwise.
% NOTE(review): presumably \nonstopmode is chosen so that the resulting
% "file not found" error is shown on the terminal -- confirm.
\IfFileExists{\wcFileName}{\batchmode}{\nonstopmode}
% Build the final command sequence in \@tempa with the file name already
% expanded, then run it: restore the category code of @, input the
% document, and issue \stop afterwards. Because \@tempa is fully tokenized
% before \makeatother takes effect, the catcode change cannot disturb it.
\edef\@tempa{%
  \noexpand\makeatother
  \noexpand\input{\wcFileName}%
  \noexpand\stop
}
\@tempa