news.utdallas.edu!wupost!howland.reston.ans.net!agate!doc.ic.ac.uk!uknet!mcsun!sun4nl!mhres!pronto!news Tue Mar  9 09:38:59 CST 1993
Article: 1468 of comp.lang.perl
Xref: feenix.metronet.com comp.lang.perl:1468
Path: feenix.metronet.com!news.utdallas.edu!wupost!howland.reston.ans.net!agate!doc.ic.ac.uk!uknet!mcsun!sun4nl!mhres!pronto!news
From: jv@mh.nl (Johan Vromans)
Newsgroups: comp.lang.perl
#Subject: WordPerfect document scanner
Message-ID: <C3MIA9.M5F@pronto.mh.nl>
Date: 9 Mar 93 13:18:08 GMT
Sender: news@pronto.mh.nl (USEnet News System)
Followup-To: alt.sources.d
Organization: Multihouse Automation, the Netherlands
Lines: 288
X-Md4-Signature: d90d6f205746a8854b154ba6d0177648
Nntp-Posting-Host: largo:(jv)

This program reads a WordPerfect document and outputs its contents in
a way suitable to study and analyze it.
It can be used as a driver to perform your own WP to whatever
conversions. 
Disclaimer: this program is not supported, use at your own risk.

	Johan

---- Cut Here and feed the following to sh ----
#!/bin/sh
# This is a shell archive (produced by shar 3.49)
# To extract the files from this archive, save it to a file, remove
# everything above the "!/bin/sh" line above, and type "sh file_name".
#
# made 03/01/1993 15:47 UTC by jv@largo
# Source directory /a/pronto/mozart/users/jv
#
# existing files will NOT be overwritten unless -c is specified
#
# This shar contains:
# length  mode       name
# ------ ---------- ------------------------------------------
#   6374 -rw-r--r-- wpscan.pl
#
# ============= wpscan.pl ==============
if test -f 'wpscan.pl' -a X"$1" != X"-c"; then
	echo 'x - skipping wpscan.pl (File already exists)'
else
echo 'x - extracting wpscan.pl (Text)'
sed 's/^X//' << 'SHAR_EOF' > 'wpscan.pl' &&
X#!/usr/local/bin/perl
X# wp2txt.pl -- convert WP document to MH-Doc format
X# SCCS Status     : %Z%@ %M%	%I%
X# Author          : Johan Vromans
X# Created On      : Mon Jul 13 21:23:19 1992
X# Last Modified By: Johan Vromans
X# Last Modified On: Mon Mar  1 16:43:01 1993
X# Update Count    : 315
X# Status          : Unknown, Use with caution!
X
X################ Common stuff ################
X
X# $LIBDIR = $ENV{"MH_DOCLIB"} || "/usr/local/lib/mh_doc";
X# unshift (@INC, $LIBDIR);
X# require "common.pl";
X$my_package = "PRONTO/MH-Doc V1.02";
X$my_version = "%I%";
X$my_name = "%M%";
X
X################ Program parameters ################
X
X$opt_start = $opt_end = 0;
X$opt_outline = 0;
X$opt_analyze = 1;
X$opt_short = 0;
X$opt_attr = 0;
X$opt_header = 1;
X$opt_verbose = 0;
X&options if $ARGV[0] =~ /^-/;
X
X################ Presets ################
X
X@ARGV = ("-") unless @ARGV;
Xopen (WPD, @ARGV[0]) || die ("$ARGV[0]: $!\n");
X&wpd_check;
X
Xif ( $opt_analyze ) {
X    &analyze;
X}
Xelse {
X    &convert;
X}
X
Xexit (0);
X
X################ Subroutines ################
X
X# Extract a two-byte (little endian) word from the input.
Xsub opc_word {
X    shift (@opc) | (shift (@opc) << 8);
X}
X
X# Skip a number of bytes.
Xsub opc_skip {
X    local ($dis) = @_;
X    splice (@opc, 0, $dis);
X}
X
X# Verify validity of WP document.
Xsub wpd_check {
X
X    local ($hdr) = '';
X
X    sysread (WPD, $hdr, 16);
X    local ($id, $std, $filetype) = unpack ("a4Vv", $hdr);
X
X    if ($id eq "\xffWPC" &&
X	$filetype == 0x0A01 ) {
X
X	# Seek to start of document, and slurp it.
X	$opt_start = $std unless $opt_start;
X	seek (WPD, $opt_start, 0);
X	local ($/) = undef;
X	$wpd = <WPD>;
X	$wpdptr = $opt_start;
X
X	# Truncate if requested.
X	$end = $wpdptr + length ($wpd) - 1;
X	if ( $opt_end > $opt_start ) {
X	    substr ($wpd, $opt_end-$wpdptr) = '';
X	}
X	else {
X	    $opt_end = $end;
X	}
X
X	# Feedback
X	if ( $opt_debug || $opt_analyze ) {
X	    printf STDERR ("Startdoc @ 0x%x (%d)%s\n", $std, $std,
X			   ($opt_start != $std || $opt_end != $end) ?
X			   sprintf (", range is 0x%x (%d) thru 0x%x (%d)",
X				    $opt_start, $opt_start,
X				    $opt_end-1, $opt_end-1) : '');
X	}
X    }
X    else {
X	die ("Not a WP document\n");
X    }
X}
X
X# The main processors --
X
Xsub analyze {
X
X    local ($opc);		# opcode under examination
X    local ($opc_type);		# type (0 = string, 1 = bytes)
X    local ($opc_len);		# total length op opcode sequence
X    local ($opc_char);		# text, if $opc_type == 0
X    local (@opc);		# bytes, if $opc_type == 1
X
X    while ( &fetch >= 0 ) { ; }	# &fetch will do all
X}
X
Xsub options {
X
X    require "newgetopt.pl";
X    local ($opt_noheader) = 0;
X
X    $opt_help = $opt_ident = 0;
X
X    if ( ! &NGetOpt 
X	("start=s", "end=s",
X	 "analyze", "short", "outline",
X	 "attr", "header", "noheader",
X	 "verbose", "ident", "quiet", "help", "debug") 
X	    || $opt_help ) {
X	print STDERR <<EndOfUsage;
XThis is $my_package [$my_name $my_version]
X
XUsage: $0 [options] file...
X
X  options are:
X    -output file	destination
X    -start XXX		start conversion at address XXX
X    -end XXX		end conversion before address XXX
X    -ident		print program name and version
X    -verbose		verbose info
X    -attr		include traces of attributes
X    -noheader		suppress MH-Doc header
X    -analyze		analyze input
X    -outline		make outline
X    -short		with -analyze: shorten long opcodes
X    -help		this message
XEndOfUsage
X	exit (1);
X    }
X
X    $opt_start = oct ($opt_start) if $opt_start =~ /^0/;
X    $opt_end = oct ($opt_end) if $opt_end =~ /^0/;
X
X    if ( defined $opt_output && $opt_output ne '' ) {
X	open (STDOUT, ">$opt_output");
X    }
X
X    if ( $opt_analyze ) {
X	$opt_analyze = 2 unless $opt_short;
X	open (STDERR, ">&STDOUT");
X	if ( -t STDERR ) {
X	    select (STDERR);
X	    $| = 1;
X	}
X    }
X
X    $opt_header = !$opt_noheader;
X    print STDERR ("This is $my_package [$my_name $my_version]") if $opt_ident;
X}
X
Xsub fetch {
X
X    # Fetch the next opcode from the input.
X    #
X    # $opc_type will designate the type of opcode:
X    #   0 -> sequence of ASCII characters, delivered in $opc_char.
X    #   1 -> sequence of bytes, delevired in @opc.
X    # $opc_len will hold the number of bytes involved.
X    #
X    # Return value is $opc_type, or -1 if exhausted.
X
X    return -1 if $wpd eq '';
X
X    if ( $wpd =~ /^[ -~]+/ ) {
X	$opc_char = $&;
X	$wpd = $';
X	$opc_len = length ($opc_char);
X	$opc_type = 0;
X	printf STDERR ("%04x: %s\n", $wpdptr, $opc_char)
X	    if $opt_debug || $opt_analyze;
X	$wpdptr += $opc_len;
X	return $opc_type;
X    }
X
X    $opc = ord ($wpd);
X    $opc_type = 1;
X    $opc_len = -1;
X       if ( $opc <= 037 || $opc == 0x7f )	{ $opc_len = 1; }
X    elsif ( $opc >= 0x80 && $opc <= 0xbf )	{ $opc_len = 1; }
X    elsif ( $opc == 0xc0 )			{ $opc_len = 4; }
X    elsif ( $opc == 0xc1 )			{ $opc_len = 9; }
X    elsif ( $opc == 0xc2 )			{ $opc_len = 11; }
X    elsif ( $opc == 0xc3 || $opc == 0xc4 )	{ $opc_len = 3; }
X    elsif ( $opc == 0xc5 )			{ $opc_len = 5; }
X    elsif ( $opc == 0xc6 )			{ $opc_len = 6; }
X    elsif ( $opc == 0xc7 )			{ $opc_len = 7; }
X    elsif ( $opc >= 0xc8 && $opc <= 0xcf ) {
X	printf STDERR ("%04x: [Reserved opcode %02x]\n", $wpdptr, $opc);
X	$opc_len = 1;
X    }
X
X    if ( $opc_len > 0 ) {
X	@opc = unpack ("C*", substr ($wpd, 0, $opc_len));
X	substr ($wpd, 0, $opc_len) = '';
X	if ( $opt_debug || $opt_analyze ) {
X	    printf STDERR ("%04x: [" . join(" ", ("%02x") x $opc_len) . "]\n",
X			   $wpdptr, @opc);
X	}
X	die ("*** PHASE ERROR ***\n") unless $opc == $opc[$#opc];
X	$wpdptr += $opc_len;
X	return $opc_type;
X    }
X
X    $opc_len = unpack ("v", substr ($wpd, 2, 2));
X    @opc = unpack ("C*", substr ($wpd, 0, $opc_len+4));
X    substr ($wpd, 0, $opc_len+4) = '';
X    if ( $opt_debug || $opt_analyze ) {
X	local (@opcb) = @opc;
X	local (@finals) = splice (@opcb, $#opcb-3);
X	printf STDERR ("%04x: [%02x %02x %02x%02x (%d)", $wpdptr,
X		       splice (@opcb, 0, 4), $opc_len);
X	if ( $opt_analyze == 1 && @opcb > 6 ) {
X	    printf STDERR (" %02x %02x ... %02x %02x", 
X			   $opcb[0], $opcb[1], $opcb[$#opcb-1], $opcb[$#opcb]);
X	}
X	else {
X	    foreach $b ( @opcb ) {
X		printf STDERR (" %02x", $b);
X	    }
X	}
X	printf STDERR (" %02x%02x %02x %02x]\n", @finals);
X    }
X    die ("*** OPCODE LENGHT ERROR ***\n")
X	unless ($opc[$#opc-3] | ($opc[$#opc-2] << 8)) == $opc_len;
X    $wpdptr += $opc_len += 4;
X    die ("*** OPCODE SUBFUN ERROR ***\n") unless $opc[1] == $opc[$#opc-1];
X    die ("*** OPCODE ERROR ***\n") unless $opc == $opc[$#opc];
X    return $opc_type;
X}
SHAR_EOF
chmod 0644 wpscan.pl ||
echo 'restore of wpscan.pl failed'
Wc_c="`wc -c < 'wpscan.pl'`"
test 6374 -eq "$Wc_c" ||
	echo 'wpscan.pl: original size 6374, current size' "$Wc_c"
fi
exit 0
--
Johan Vromans				       jv@mh.nl via internet backbones
Multihouse Automatisering bv		       uucp:..!{uunet,sun4nl}!mh.nl!jv
Doesburgweg 7, 2803 PL Gouda, The Netherlands  phone/fax: +31 1820 62911/62500
------------------------ "Arms are made for hugging" -------------------------


