/* ----------------------------------------------------------------- */ lastmod='1996-09-21' /* 96-09-21 or rework follow href= error */ /* 96-09-11 or tag due to etraas@te.xs4all.nl */ /* 96-08-21 or rework tag error */ /* 96-08-03 or rework
tag due to "Kirchner Soft" */ /* 96-08-02 or rework
tag due to pinkas@en.com */ /* 96-07-16 or follow href= */ /* 96-06-29 or rework */ /* 96-04-17 or try to support
*/ /* 96-01-15 or reworked */ /* 95-07-10 or decode HTML files */ /* --------------------------------------------------------- */ /* call: htm2txt infile [l nn [p nn [e editor [u */ /* output: infile-name.TXT */ /* */ /* recognised tags: */ /* */ /* all tags as supported by ibm webex 1.1b */ /* */ /* special tags: */ /* */ /* trace '?i' */ /* exit immediately */ /* */ /* recognised substitute variables see variable 'consts' */ /* tab-char ' ' will be ignored */ /* */ /* notes: */ /* */ /* all tags are converted as 'best fit'. */ /* the image a browser produces will not be met. */ /* */ /* --------------------------------------------------------- */ /* variables to be customized */ /* */ linemax=72 /* maximum line length */ pixlbyt= 6 /* pixels per byte for tables */ editor ='e' /* editor for output file */ /* --------------------------------------------------------- */ /* constants as known of today */ /* */ consts= "&space '20'x" consts=consts "  '20'x" consts=consts "ß " consts=consts "ä " consts=consts "Ä " consts=consts "ö " consts=consts "Ö " consts=consts "ü " consts=consts "Ü " consts=consts "á " consts=consts "é " consts=consts "í " consts=consts " '20'x" consts=consts ": :" consts=consts "< <" consts=consts "> >" consts=consts "[ [" consts=consts "] ]" consts=consts "’ '" /* */ consts=consts '" "' consts=consts '‘ "' /* */ consts=consts '“ "' /* */ consts=consts '” "' /* */ consts=consts "  'a0'x" consts=consts "° " /* EBCDIC !!! */ consts=consts "¹ '" /* */ consts=consts "&mdash -" consts=consts "< <" consts=consts "> >" consts=consts "& '00'x" consts=consts "© '20'x" /* --------------------------------------------------------- */ /* check input parameters */ /* */ parse upper arg ifiname options if ifiname='' then exit 4 parse var ifiname fn '.' ext if ext='' then ifiname=ifiname'.HTM' ofiname = fn'.TXT' /* --------------------------------------------------------- */ /* check for options */ /* */ swi_url=0 do while options \= '' parse upper var options opt val options select when opt='L' then linemax=val when opt='P' then pixlbyt=val when opt='E' then editor =val when opt='U' then do swi_url=1 options=val options end otherwise nop end end /* --------------------------------------------------------- */ /* activate debug facilities */ /* */ /* signal on syntax signal on error signal on failure signal on halt */ /* --------------------------------------------------------- */ /* some global controls */ /* */ hrf.0=1 /* href-control */ hrf.1=ifiname 'erase' ofiname '2>NUL' call lineout ofiname,'HTM2TXT v.' lastmod call lineout ofiname,' ' call lineout ofiname,'Extracted from' ifiname',' date()',' left(time(),5) call lineout ofiname,' ' do nexthrf=1 while hrf.0>=nexthrf call process_file hrf.nexthrf end call lineout ofiname say if editor \= '' then 'start /F' editor ofiname /* <=== edit result */ /* --------------------------------------------------------- */ exit 0 /* --------------------------------------------------------- */ /* process a file */ /* */ process_file: parse arg ifiname /* --------------------------------------------------------- */ /* read infile */ /* */ nl ='0d'x /* new line character */ ifi='' say say 'reading' ifiname call stream ifiname,'c','close' do i=1 while chars(ifiname)>1 l=linein(ifiname)||nl l=translate(l,' ','09'x) ifi = ifi||l end call stream ifiname,'c','close' say i-1 'records read from' ifiname /* --------------------------------------------------------- */ /* format outfile lines */ /* */ ofi.0=0 /* out file controls */ dlspaces ='' /*
-spaces */ lispaces ='' /*
  • -spaces */ indents =0 /* number of indents */ blanklines=0 /* number of blank lines */ linelen =linemax /* max. linelength */ outtext ='' /* initial text */ /* switches: */ swi_pre = 0 /* switch PRE */ swi_tbl = 0 /* switch table definition */ swi_lst = 0 /* switch list definition */ swi_cnt = 0 /* switch center text */ swi_cat = 0 /* switch concatenate */ swi_trc = 0 /* switch trace */ /* --------------------------------------------------------- */ /* scan input stream */ /* */ call charout ,'processing token ' text='' do count=1 while length(ifi)>0 call charout ,format(count,5) copies('08'x,6) if swi_trc then trace 'i' /* check next line */ parse var ifi parttext '<' tag '>' ifi /* process text */ select when swi_pre then call process_preformatted when strip(parttext)=nl then nop otherwise do do while pos(nl,parttext)>0 parse var parttext a (nl) b parttext=strip(a) strip(b) end if swi_cat then text=text||parttext else do if text='' then text= parttext else text=text parttext end end end /* process tag */ tag=translate(tag,' ',nl) if left(tag,1)='!' then tag='!' substr(tag,2) parse var tag tag options tag=translate(tag) swi_cat=0 select when tag='TRACE' then swi_trc=1 when tag='EXIT' then signal finish when tag='!' then call out '***' options '***' when tag='FONT' then swi_cat=1 when tag='UL', | tag='OL', | tag='DL', | tag='DIR', | tag='MENU', then do call out text call out ' ' lispaces=' * ' indents=indents+1 swi_lst=1 end when tag='LI' then call out text when tag='DT' then do call out text lispaces=' * ' if indents>0 then indents=indents-1 end when tag='DD' then do call out text lispaces=' ' indents=indents+1 end when tag='/UL', | tag='/OL', | tag='/DL', | tag='/DIR', | tag='/MENU', then do call out text lispaces='' if indents>0 then indents=indents-1 call out ' ' swi_lst=0 end when tag='CENTER', | tag='CENTRE', then swi_cnt=1 when tag='/CENTER', | tag='/CENTRE', then do swi_cnt=0 call out text end when tag='P', | tag='/TITLE', | tag='/CENTER', | tag='/CENTRE', then call out text when tag='/HEAD', then do call out text call out ' ' end when tag='PRE' then do swi_pre=1 linelen=parmval('WIDTH',options) end when tag='/PRE' then do swi_pre=0 linelen=linemax end when tag='HR' then call out copies('-',linelen) when tag='H1', | tag='H2', | tag='H3', | tag='H4', | tag='/H1', | tag='/H2', | tag='/H3', | tag='/H4', | tag='/CAPTION', then do call out text call out ' ' end when tag='A' then do parse upper var options 'HREF' . '"' hrefid '"' nogo= pos('#',hrefid)>0 srefid='' if swi_url, & \nogo then do srefid=hrefid end parse var hrefid z '.' fext nogo=nogo|(left(fext,3)\='HTM') parse var hrefid z 'FILE:' hrefid if hrefid='' then hrefid=z nogo=nogo|(strip(hrefid)='') do i=1 to hrf.0 if hrf.i=hrefid then leave end if (i>hrf.0)&(\nogo) then do hrf.0=hrf.0+1; z=hrf.0; hrf.z=hrefid end end when tag='/A' then do if swi_url, & srefid\='' then do text=text '('srefid')' srefid='' end end /* when tag='IMG' then do z=parmval('ALT',options) if z\=0 then do if swi_tbl then do text=z call save_table_text end else text=text z end end */ when tag='TABLE' then do call out text call out ' ' swi_tbl=1 swi_wid=1 tbwid. =0 end when tag='TR' then do tbcol=0 tbmax=0 drop tbtxt. end when tag='TD' then do /* determine next column */ z=parmval('COLSTART',options) if z=0 then tbcol=tbcol+1 else tbcol=z if tbmax0 then do select when right(p,3)='PIX' then do parse var p n 'PIX' . tbwid.tbcol=n%pixlbyt end when right(p,1)='%' then do parse var p n '%' . tbwid.tbcol=(n*linelen)%100 end otherwise if p>linemax then p=linemax tbwid.tbcol=p end end /* set lines/col to 0 */ tblin.tbcol=0 end when tag='/TD' then do if swi_tbl then call save_table_text end when tag='/TR' then do if swi_tbl then do /* col-width already done ? */ if swi_wid then do swi_wid=0 /* check predefined col-width */ colwi=0 do i=1 to tbmax colwi=colwi+tbwid.i end linelen=linemax-colwi if linelen<=0 then linelen=linemax /* set col-width if not set */ do i=1 to tbmax if tbwid.i>0 then iterate tbwid.i=linelen%tbmax end linelen=linemax /* check sum colwid exceeds */ sum_col=0 do i=1 to tbmax sum_col=sum_col+tbwid.i end if sum_col>linemax then do ratio=linemax/sum_col do i=1 to tbmax tbwid.i=trunc(tbwid.i/ratio) end end end /* get max nr. lines in row */ lnmax=1 do i=1 to tbmax if lnmaxtbwid.k , & tbwid.k>0 then do z=lastpos(' ',tbtxt.k.y,tbwid.k) if z=0 then do /* give up */ otext=tbtxt.k.y tbtxt.k.y='' end else do /* split text */ otext=left(tbtxt.k.y,z) tbtxt.k.y=substr(tbtxt.k.y,z) anytxt=1 end end else do otext=tbtxt.k.y tbtxt.k.y='' end tbtxt.1.y='_' /* build output line */ text=text left(otext,tbwid.k) end /* all cols processed */ call out_table_text end end end end when tag='/TABLE' then do blanklines=0 call out ' ' swi_tbl=0 end when tag='BR' then do if swi_lst then call out text if swi_tbl , & (tbmax>1) then call save_table_text else call out text end otherwise nop end /* all finished */ end /* --------------------------------------------------------- */ /* write outfile */ /* */ finish: say do i=1 to ofi.0 call lineout ofiname,ofi.i end /* --------------------------------------------------------- */ return /* ========================================================= */ /* --------------------------------------------------------- */ /* save table-text */ /* */ save_table_text: if strip(text)='' then return tblin.tbcol=tblin.tbcol+1 z=tblin.tbcol tbtxt.tbcol.z=text text='' return /* --------------------------------------------------------- */ /* out table-text */ /* */ out_table_text: text = strip(text) if text ='' then return if text \= '_' then call o text text = '' return /* --------------------------------------------------------- */ /* process preformatted */ /* */ process_preformatted: do while length(parttext)>0 parse var parttext outtext (nl) parttext call out outtext end return /* --------------------------------------------------------- */ /* extract parameter values */ /* */ parmval: procedure; parse upper arg key,string z=pos(key,string) if z=0 then return 0 string=substr(string,z) parse var string '=' val . val=translate(val,' ','"') val=translate(strip(val)) return val /* --------------------------------------------------------- */ /* do output lines */ /* */ out: oli=subs(arg(1)) oll=length(oli) /* do not output more than 1 blank line */ if oll=0 then do if blanklines>0 then return blanklines=blanklines+1 end if linelen>0 then do do while oll>linelen z=lastpos(' ',oli,linelen) if z=0 then z=oll if (z>0) then do call o left(oli,z) oli=strip(substr(oli,z+1)) oll=length(oli) end end end call o oli if oll>0 then blanklines=0 text='' return o: procedure expose swi_cnt linelen indents dlspaces lispaces ofi. parse arg ooo if swi_cnt then do z=(linelen-length(ooo))%2 if z>0 then prefix=copies(' ',z) else prefix='' end else do prefix=copies(' ',indents)||lispaces||dlspaces end ofi.0=ofi.0+1; z=ofi.0; ofi.z=prefix||ooo return /* --------------------------------------------------------- */ /* substitute constants */ /* */ subs: procedure expose consts; l = arg(1) /* check for tab chars */ l=translate(l,' ','09'x) /* check for variables */ z=pos('&',l) if z=0 then return strip(l) do while z > 0 head = left(l,z-1) token = substr(l,z) do i=1 to words(consts) by 2 a=word(consts,i) b=length(a) c=left(token,b) d=word(consts,i+1) if right(d,2)="'x" then interpret "d="d if c=a then do head=head||d token=substr(token,b+2) leave end end if i>words(consts) then do token='?'substr(token,2) end l = head||token z=pos('&',l) end return strip(translate(l,'&','00'x)) /* --------------------------------------------------------- */ syntax: say 'signal on syntax in' sigl':' strip(sourceline(sigl)) signal common_error error: say 'signal on error in' sigl':' strip(sourceline(sigl)) signal common_error failure: say 'signal on failure in' sigl':' strip(sourceline(sigl)) signal common_error halt: say 'signal on halt in' sigl':' strip(sourceline(sigl)) signal common_error common_error: trace '?i' do forever nop end /* --------------------------------------------------------- */