/* ----------------------------------------------------------------- */ lastmod='1997-02-19' /* ----------------------------------------------------------------- */ /* variables to be customized */ /* */ /* following variables may be set to permanent installation */ /* specific values. they may be temporary modified */ /* by command line options */ /* option */ /* ------ */ linemax=72 /* maximum line length l n */ pixlbyt= 6 /* pixels per byte for tables p n */ editor ='E' /* editor for output file e [..] */ chain ='Y' /* follow url-chain f n */ showu ='N' /* show url link adddress u */ ofile ='.TXT' /* derive outfilename from ifile o name */ /* ----------------------------------------------------------------- */ /* 97-02-19 or define out-file name due to Ralph_Ulrich@p31.lemmi.ftg.donut.de */ /* 97-02-19 or switch off chaining due to jblumel@gs.net */ /* 97-02-06 or accept missing tags , */ /* 97-01-25 or substitute &#... tokens, correction */ /* 96-12-31 or
 correction                                      */
/* 96-12-18 or /FONT correction                                      */
/* 96-11-17 or list of &constants    due to tremro@digicom.qc.ca     */
/* 96-11-07 or problem with nested tables                            */
/* 96-10-22 or filenames drag-drop            due to sahag@ibm.net   */
/* 96-10-21 or filenames with wildcard *      due to sahag@ibm.net   */
/* 96-10-19 or filenames with embedded blanks due to sahag@ibm.net   */
/* 96-10-17 or rework width=    due to lconyers@postmaster2.dot.gov  */
/* 96-09-21 or rework follow href= error                             */
/* 96-09-11 or  tag    due to etraas@te.xs4all.nl             */
/* 96-08-21 or rework 
tag error */ /* 96-08-03 or rework
tag due to "Kirchner Soft" */ /* 96-08-02 or rework
tag due to pinkas@en.com */ /* 96-07-16 or follow href= */ /* 96-06-29 or rework */ /* 96-04-17 or try to support
*/ /* 96-01-15 or reworked */ /* 95-07-10 or decode HTML files */ /* ----------------------------------------------------------------- */ /* call: htm2txt infile [l nn [p nn [e editor [u [f n [o xxx */ /* output: infile-name.TXT */ /* */ /* recognised tags: */ /* */ /* all tags as supported by ibm webex 1.1b */ /* */ /* special tags: */ /* */ /* trace '?i' */ /* exit immediately */ /* */ /* recognised substitute variables see variable 'consts' */ /* tab-char ' ' will be ignored */ /* */ /* notes: */ /* */ /* all tags are converted as 'best fit'. */ /* the image a browser produces will not be met. */ /* */ /* --------------------------------------------------------- */ /* constants contributed by tremro@digicom.qc.ca */ /* */ consts= "space '20'x" consts=consts "#32 '20'x" consts=consts "quot '22'x" consts=consts "#34 '22'x" consts=consts "amp '00'x" consts=consts "#38 '00'x" consts=consts "#39 '27'x" consts=consts "#58 '3a'x" consts=consts "#60 <" consts=consts "lt <" consts=consts "#62 >" consts=consts "gt >" consts=consts "#91 [" consts=consts "#93 ]" consts=consts "nbsp '20'x" consts=consts "#160 '20'x" consts=consts "iexcl 'a1'x" consts=consts "cent 'a2'x" consts=consts "pound 'a3'x" consts=consts "curren 'a4'x" consts=consts "yen 'a5'x" consts=consts "brvbar 'a6'x" consts=consts "sect 'a7'x" consts=consts "uml 'a8'x" consts=consts "copy 'a9'x" consts=consts "ordf 'aa'x" consts=consts "laqno 'ab'x" consts=consts "not 'ac'x" consts=consts "shy 'ad'x" consts=consts "reg 'ae'x" consts=consts "hibar 'af'x" consts=consts "deg 'b0'x" consts=consts "plusmn 'b1'x" consts=consts "sup2 'b2'x" consts=consts "sup3 'b3'x" consts=consts "acute 'b4'x" consts=consts "micro 'b4'x" consts=consts "para 'b6'x" consts=consts "middot 'b7'x" consts=consts "cedil 'b8'x" consts=consts "sup1 'b9'x" consts=consts "ordm 'ba'x" consts=consts "raquo 'bb'x" consts=consts "frac14 'bc'x" consts=consts "frac12 'bd'x" consts=consts "frac34 'be'x" consts=consts "iquest 'bf'x" consts=consts "Agrave 'c0'x" consts=consts "Aacute 'c1'x" consts=consts "Acirc 'c2'x" consts=consts "Atilde 'c3'x" consts=consts "Auml 'c4'x" consts=consts "Aring 'c5'x" consts=consts "AElig 'c6'x" consts=consts "Ccedil 'c7'x" consts=consts "Egrave 'c8'x" consts=consts "Eacute 'c9'x" consts=consts "Ecirc 'ca'x" consts=consts "Euml 'cb'x" consts=consts "Igrave 'cc'x" consts=consts "Iacute 'cd'x" consts=consts "Icirc 'ce'x" consts=consts "Iuml 'cf'x" consts=consts "ETH 'd0'x" consts=consts "Ntilde 'd1'x" consts=consts "Ograve 'd2'x" consts=consts "Oacute 'd3'x" consts=consts "Ocirc 'd4'x" consts=consts "Otilde 'd5'x" consts=consts "Ouml 'd6'x" consts=consts "times 'd7'x" consts=consts "Oslash 'd8'x" consts=consts "Ugrave 'd9'x" consts=consts "Uacute 'da'x" consts=consts "Ucirc 'db'x" consts=consts "Uuml 'dc'x" consts=consts "Yacute 'dd'x" consts=consts "THORN 'de'x" consts=consts "szlig 'df'x" consts=consts "agrave 'e0'x" consts=consts "aacute 'e1'x" consts=consts "acirc 'e2'x" consts=consts "atilde 'e3'x" consts=consts "auml 'e4'x" consts=consts "aring 'e5'x" consts=consts "aelig 'e6'x" consts=consts "ccedil 'e7'x" consts=consts "egrave 'e8'x" consts=consts "eacute 'e9'x" consts=consts "ecirc 'ea'x" consts=consts "euml 'eb'x" consts=consts "igrave 'ec'x" consts=consts "iacute 'ed'x" consts=consts "icirc 'ee'x" consts=consts "iuml 'ef'x" consts=consts "eth 'f0'x" consts=consts "ntilde 'f1'x" consts=consts "ograve 'f2'x" consts=consts "oacute 'f3'x" consts=consts "ocirc 'f4'x" consts=consts "otilde 'f5'x" consts=consts "ouml 'f6'x" consts=consts "divide 'f7'x" consts=consts "oslash 'f8'x" consts=consts "ugrave 'f9'x" consts=consts "uacute 'fa'x" consts=consts "ucirc 'fb'x" consts=consts "uuml 'fc'x" consts=consts "yacute 'fd'x" consts=consts "thorn 'fe'x" consts=consts "yuml 'ff'x" /* --------------------------------------------------------- */ /* check input parameters */ /* */ if arg(1)='' then do say say 'correct call is:' say say ' htm2txt infilename [options ' say ' infilename = fully qualified path' say ' may contain wildcard *' say ' options (any order)' say ' l nn max nn chars in a line' say ' p nn max nn pixels per byte' say ' e editor name' say ' o outfile name' say ' f n do not follow url-chain' say ' u show anchor url''s' say exit 4 end parse arg arg arg=strip(translate(arg,' ','"')) z=pos('.',arg) if z=0 then do parse var arg ifiname options ifiname=ifiname'.HTM' end else do parse var arg ifiname '.' ifext options ifiname=ifiname'.'ifext end /* --------------------------------------------------------- */ /* check for wildcard char in ifiname */ /* */ wcd.0=1; wcd.1=ifiname swi_wcd = pos('*',ifiname)>0 if swi_wcd then do if \RxFuncAdd('SysLoadFuncs','RexxUtil','SysLoadFuncs') then call 'SysLoadFuncs' call SysFileTree ifiname,'WCD.','FO' end /* --------------------------------------------------------- */ /* check for options */ /* */ swi_url=0 swi_dbg=0 do while options \= '' parse upper var options opt val options select when opt='L' then linemax=val when opt='P' then pixlbyt=val when opt='E' then editor =val when opt='F' then chain =val when opt='O' then ofile =val when opt='U' then do; showu ='Y'; options=val options; end when opt='D' then do; swi_dbg=1; options=val options; end otherwise nop end end if translate(showu)='N' then swi_url=0; else swi_url=1 if translate(chain)='Y' then swi_chn=1; else swi_chn=0 /* --------------------------------------------------------- */ /* activate debug facilities */ /* */ if swi_dbg then do say 'debug active' signal on syntax signal on error signal on failure signal on halt end /* --------------------------------------------------------- */ /* delete output file */ /* */ if ofile='.TXT' then do parse var ifiname ofiname '.' . ofiname=ofiname'.TXT' end else ofiname=ofile if swi_dbg then signal off error 'erase' '"'ofiname'"' '2>NUL' if swi_dbg then signal on error /* --------------------------------------------------------- */ /* loop on file-list */ /* */ call time 'r' do wcd = 1 to wcd.0 ifiname=wcd.wcd /* --------------------------------------------------------- */ /* some global controls */ /* */ hrf.0=1 /* href-control */ hrf.1=ifiname call lineout ofiname,'HTM2TXT v.' lastmod call lineout ofiname,' ' call lineout ofiname,'Extracted from' ifiname',' date()',' left(time(),5) call lineout ofiname,' ' do nexthrf=1 while hrf.0>=nexthrf call process_file hrf.nexthrf end /* --------------------------------------------------------- */ end /* end wildcard loop */ call lineout ofiname /* --------------------------------------------------------- */ /* edit result */ swi_edt=(editor\='') if swi_wcd then if wcd>1 then swi_edt=0 if swi_edt then 'start /F' editor '"'ofiname'"' /* --------------------------------------------------------- */ laps=time('e')%1 min=laps%60 sec=laps//60 say 'finished' min':'right(sec,2,0) 'min' exit 0 /* --------------------------------------------------------- */ /* process a file */ /* */ process_file: parse arg ifiname /* --------------------------------------------------------- */ /* read infile */ /* */ nl ='0d'x /* new line character */ ifi='' say say 'reading' ifiname /* --------------------------------------------------------- */ /* mod due to Ralph_Ulrich@p31.lemmi.ftg.donut.de */ /* */ call charin ifiname,1,0 ifiname_LEN = chars(ifiname) ifi = charin( ifiname,1,ifiname_LEN) call stream ifiname,'c','close' say ifiname_LEN 'Bytes read from' ifiname ifi=translate(ifi,' ','090A'x) /* --------------------------------------------------------- */ /* format outfile lines */ /* */ ofi.0=0 /* out file controls */ dlspaces ='' /*
-spaces */ lispaces ='' /*
  • -spaces */ indents =0 /* number of indents */ blanklines=0 /* number of blank lines */ linelen =linemax /* max. linelength */ outtext ='' /* initial text */ /* switches: */ swi_pre = 0 /* switch PRE */ swi_tbl = 0 /* switch table */ swi_tr = 0 /* switch table row active */ swi_td = 0 /* switch def/hdr active */ swi_wid = 1 /* switch calc.col width */ swi_lst = 0 /* switch list definition */ swi_cnt = 0 /* switch center text */ swi_cat = 0 /* switch concatenate */ swi_trc = 0 /* switch trace */ cnt_tbl = 0 /* count nested tables */ /* --------------------------------------------------------- */ /* scan input stream */ /* */ call charout ,'processing token ' text='' count=0 do while length(ifi)>0 if swi_trc then do; interpret 'trace' tracetag; swi_dbg=1; end /* check next line */ parse var ifi parttext '<' tag '>' ifi if pos('<',tag)>0 then do parse var tag tag '<' rest ifi='<'rest'>'ifi end /* process text */ select when swi_pre then call process_preformatted when strip(parttext)=nl then nop otherwise do do while pos(nl,parttext)>0 parse var parttext a (nl) b parttext=strip(a) strip(b) end if swi_cat then text=text||parttext else do if text='' then text= parttext else text=text parttext end end end /* process tag */ tag=translate(tag,' ',nl) if left(tag,1)='!' then tag='!' substr(tag,2) parse var tag tag options tag=translate(tag) if tag='TRACE' then do swi_trc=1 if pos('?',options)>0 then tracetag='?i' else tracetag=' i' end swi_cat=0 count=count+1 if \swi_dbg then call charout , copies('08'x,6)||format(count,5)' ' select when swi_tbl then do select when tag='TR' then do if swi_td then call save_table_text if swi_tr then call end_row tabcol=0 drop tbtxt. end when tag='TD' , | tag='TH' then do if swi_td then call save_table_text swi_tr=1 swi_td=1 /* determine next column */ z=parmval('COLSTART',options) if z=0 then tabcol=tabcol+1 else tabcol=z if colmax0 & \datatype(p,'NUM') then do z=verify(p,'1234567890'); n=0 if z>0 then do n=substr(p,z,1) q=left(p,z-1) end select when n='P' then do tbwid.tabcol.0=q%pixlbyt end when n='%' then do tbwid.tabcol.0=(q*linelen)%100 end otherwise if q>linemax then q=linemax tbwid.tabcol.0=q end end if p>0 & datatype(p,'NUM') then do tbwid.tabcol.0=p end end when tag='/TD' , | tag='/TH' then call save_table_text when tag='/TR' then call end_row when tag='TABLE' then do if swi_td then call save_table_text if swi_tr then call end_row cnt_tbl=cnt_tbl+1 end when tag='/TABLE' then do if swi_td then call save_table_text if swi_tr then call end_row blanklines=0 call out ' ' cnt_tbl=cnt_tbl-1 swi_tbl=(cnt_tbl>0) end when tag='BR' then do if colmax>1 then call save_table_text else call out text end otherwise nop end end when tag='TABLE' then do call out text blanklines=0 call out ' ' swi_tbl=1 swi_wid=1 cnt_tbl=cnt_tbl+1 swi_cnt=0 tbwid. =0 tblin. =0 tabcol =0 colmax =0 end when tag='EXIT' then signal finish when tag='!' then call out '***' options '***' when tag='FONT', | tag='/FONT' then swi_cat=1 when tag='UL', | tag='OL', | tag='DL', | tag='DIR', | tag='MENU', then do call out text call out ' ' if lispaces='' then lispaces=' * ' else lispaces=' 'lispaces indents=indents+1 swi_lst=1 end when tag='LI' then call out text when tag='DT' then do call out text dlspaces=' ' end when tag='DD' then do call out text dlspaces=' ' end when tag='/UL', | tag='/OL', | tag='/DL', | tag='/DIR', | tag='/MENU', then do call out text dlspaces='' lispaces=substr(lispaces,4) if indents>0 then indents=indents-1 call out ' ' swi_lst=0 end when tag='CENTER', | tag='CENTRE', then do swi_cnt=1 end when tag='/CENTER', | tag='/CENTRE', then do swi_cnt=0 call out text end when tag='P', | tag='/TITLE', then call out text when tag='/HEAD', then do call out text call out ' ' end when tag='PRE' then do swi_pre=1 linelen=parmval('WIDTH',options) end when tag='/PRE' then do swi_pre=0 linelen=linemax end when tag='HR' then do call out text call out copies('-',linelen) end when tag='H1', | tag='H2', | tag='H3', | tag='H4', | tag='/H1', | tag='/H2', | tag='/H3', | tag='/H4', | tag='/CAPTION', then do call out text call out ' ' end when tag='A' then do parse upper var options 'HREF' . '"' hrefid '"' nogo= pos('#',hrefid)>0 srefid='' if swi_url, & \nogo then do srefid=hrefid end parse var hrefid z '.' fext nogo=nogo|(left(fext,3)\='HTM') parse var hrefid z 'FILE:' hrefid if hrefid='' then hrefid=z nogo=nogo|(strip(hrefid)='') do i=1 to hrf.0 if hrf.i=hrefid then leave end if (i>hrf.0)&(\nogo)&(swi_chn) then do hrf.0=hrf.0+1; z=hrf.0; hrf.z=hrefid end end when tag='/A' then do if swi_url, & srefid\='' then do text=text '('srefid')' srefid='' end end /* when tag='IMG' then do z=parmval('ALT',options) if z\=0 then do if swi_tbl then do text=z call save_table_text end else text=text z end end */ when tag='BR' then call out text otherwise nop end /* all finished */ end /* --------------------------------------------------------- */ /* write outfile */ /* */ finish: do i=1 to ofi.0 call lineout ofiname,ofi.i end return /* ========================================================= */ /* --------------------------------------------------------- */ /* close table row */ end_row: swi_tr=0 swi_td=0 /* col-width already done ? */ if swi_wid then do /* check predefined col-width */ colwi=0 do i=1 to colmax if tbwid.i.0>0 then tbwid.i=tbwid.i.0 else tbwid.i=0 colwi=colwi+tbwid.i end linelen=linemax-colwi if linelen<=0 then linelen=linemax /* set col-width if not set */ do i=1 to colmax if tbwid.i>0 then iterate tbwid.i=linelen%colmax end linelen=linemax /* check sum colwid exceeds */ sum_col=0 do i=1 to colmax sum_col=sum_col+tbwid.i end if sum_col>linemax then do ratio=linemax/sum_col do i=1 to colmax tbwid.i=trunc(tbwid.i/ratio) end end end swi_wid=0 /* get max nr. lines in row */ linmax=1 do i=1 to colmax if linmaxtbwid.k , & tbwid.k>0 then do z=lastpos(' ',tbtxt.k.y,tbwid.k) if z=0 then z=tbwid.k otext=left(tbtxt.k.y,z) /* split text */ tbtxt.k.y=strip(substr(tbtxt.k.y,z)) anytxt=1 end else do otext=tbtxt.k.y tbtxt.k.y='' end if tbtxt.1.y='' then tbtxt.1.y='_' /* build output line */ text=text left(otext,tbwid.k) end /* all cols processed */ call out_table_text end end tblin.=0 return /* --------------------------------------------------------- */ /* save table-text */ /* */ save_table_text: swi_td=0 if strip(text)\='' then do tblin.tabcol=tblin.tabcol+1 z=tblin.tabcol tbtxt.tabcol.z=text end text='' return /* --------------------------------------------------------- */ /* out table-text */ /* */ out_table_text: text = strip(text) if text ='' then return if text \= '_' then call o text text = '' return /* --------------------------------------------------------- */ /* process preformatted */ /* */ process_preformatted: do while length(parttext)>0 parse var parttext outtext (nl) parttext oli=subs(outtext) ofi.0=ofi.0+1; z=ofi.0; ofi.z=outtext end return /* --------------------------------------------------------- */ /* extract parameter values */ /* */ parmval: procedure; parse upper arg key,string z=pos(key,string) if z=0 then return 0 string=substr(string,z) parse var string '=' val . val=translate(val,' ','"') val=translate(strip(val)) return val /* --------------------------------------------------------- */ /* do output lines */ /* */ out: oli=subs(arg(1)) oll=length(oli) /* do not output more than 1 blank line */ if oll=0 then do if blanklines>0 then return blanklines=blanklines+1 end if linelen>0 then do do while oll>linelen z=lastpos(' ',oli,linelen) if z=0 then z=oll if (z>0) then do call o left(oli,z) oli=strip(substr(oli,z+1)) oll=length(oli) end end end call o oli if oll>0 then blanklines=0 text='' return o: procedure expose swi_cnt linelen indents dlspaces lispaces ofi. parse arg ooo if swi_cnt then do z=(linelen-length(ooo))%2 if z>0 then prefix=copies(' ',z) else prefix='' end else do prefix=copies(' ',indents)||lispaces||dlspaces end ofi.0=ofi.0+1; z=ofi.0; ofi.z=prefix||ooo return /* --------------------------------------------------------- */ /* substitute constants */ /* */ subs: procedure expose consts count l=arg(1) /* check for tab chars */ l=translate(l,' ','09'x) /* check for variables */ z=pos('&',l) do while z > 0 parse var l head '&' token ';' tail w=wordpos(token,consts) if w=0 then do if (left(token,1)='#')&(datatype(token,'NUM')) then do token=substr(token,2) token=d2c(token) end else do token='?'token';' end end else do token=word(consts,w+1) if right(token,2)="'x" then interpret "token="token end l=head||token||tail z=pos('&',l) end return strip(translate(l,'&','00'x)) /* --------------------------------------------------------- */ syntax: say 'signal on syntax in' sigl':' strip(sourceline(sigl)) signal common_error error: say 'signal on error in' sigl':' strip(sourceline(sigl)) signal common_error failure: say 'signal on failure in' sigl':' strip(sourceline(sigl)) signal common_error halt: say 'signal on halt in' sigl':' strip(sourceline(sigl)) signal common_error common_error: trace '?i' do forever nop end /* --------------------------------------------------------- */