I dag bruker den norske stavekontrollen bindestrek (-) som markør for sammensatte ord. Denne markøren er viktig for å få "komprimeringen" til ispell, aspell og myspell til å fungere tilfredsstillende. Et kritisk problem med bruk av bindestrek som slik markør er at det gjør det umulig å representere norske ord som skal ha bindestrek i stavekontrollen. Et eksempel er CD-spiller.
En ide for å fikse dette er å endre markørtegnet til f.eks. likhetstegn (=), for å bruke samme tegn som på no.speling.org, og jeg har nå forsøkt to ganger å få dette til. En kompliserende faktor for meg er at byggesystemet til den norske stavekontrollen er veldig komplekst, og jeg forstår ikke fullt ut hvordan det fungerer.
Mitt siste forsøk ser ut til å fungere bedre enn det forrige, så jeg poster derfor patchen her i håp om at noen andre kan ta en titt og finne eventuelle problemer med den, før vi tar den i bruk. Jeg tror det er viktig å sikre at stavekontrollresultatene blir like med og uten denne patchen, før vi går videre og legger inn ord med bindestrek.
I tillegg til denne patchen av byggesystemet må alle bindestreker i ordene i norsk.words endres til likhetstegn.
Her er patchen relativ til dagens CVS-utgave av koden.
Index: Makefile =================================================================== RCS file: /cvsroot/spell-norwegian/src/spell-norwegian/Makefile,v retrieving revision 1.111 diff -u -3 -p -u -r1.111 Makefile --- Makefile 2 Nov 2006 23:08:00 -0000 1.111 +++ Makefile 11 Nov 2006 10:47:36 -0000 @@ -111,10 +111,10 @@ SHELLDEBUG = +vx
# CATNOHEADER=$(SED) -e '/^#/ D' -e 's/[ ]*#.*//' ${LANGUAGE}.words CATNOHEADER=grep -v '^#' ${LANGUAGE}.words -ALPHASUBST=tr '-' 'î' -ALPHASUBSTSED=s/-/î/g -STREKSUBST=tr 'î' '-' -STREKSUBSTSED=s/î/-/g +ALPHASUBST=tr '=' 'î' +ALPHASUBSTSED=s/=/î/g +STREKSUBST=tr 'î' '=' +STREKSUBSTSED=s/î/=/g STREKREM=tr -d 'îÎ' STREKREMSED=s/[îÎ]//g
@@ -133,7 +133,7 @@ SUFF=${SUFFNORM}${SUFFCOMP} # The awk scripts below tells which words from in each category that # should be in the dictionary. The line
-# /^[-${LCH}]{4}[${SUFF}]/ {if ($$2>4) {print $$1,$$2}} +# /^[=${LCH}]{4}[${SUFF}]/ {if ($$2>4) {print $$1,$$2}}
# says that words with length 4 containing only lowercase letters with # frequency greater then 5 should be included. Edit the scripts as @@ -142,7 +142,7 @@ SUFF=${SUFFNORM}${SUFFCOMP} # to be a legal word if the word is short. `re' is legal!
# The CHOOSEFLAG script sets the limit for flag inclusion. Example: -# adgangs-tegn is a common word, but the form adgangstegnenes scores 0 +# adgangs=tegn is a common word, but the form adgangstegnenes scores 0 # on frequency. It will be excluded by the script below if you don't # change it.
@@ -209,10 +209,10 @@ COMPOUNDLIMIT=0 # Somewhere in the long pipe making the input file for buildhash, the # data looks like # -# gutte-drøm/ 17 -# gutte-drøm/A 18 -# gutte-drøm/E 14 -# gutte-drøm/G 7 +# gutte=drøm/ 17 +# gutte=drøm/A 18 +# gutte=drøm/E 14 +# gutte=drøm/G 7 # # thus the frequenzy indicator for each flag is availiable. Awk is # used to pick the flags we want, and the variable holding the program @@ -221,7 +221,7 @@ COMPOUNDLIMIT=0 # # Later in the pipe the data looks like # -# gutte-drøm/17A18E14G7 19 +# gutte=drøm/17A18E14G7 19 # # The second field (19) is the frequenzy indicator for all words # coming from the root gutt. So here we can throw away a root with @@ -269,29 +269,29 @@ CHOOSEROOTC=${DEFAULTROOTFILTER} # Samme
# define CHOOSEFLAGB # '//[ ${PRE}]/ {print $$1,$$2} \ -# /^[-${LCH}]{1,2}/[${SUFF}]/ {if ($$2>6) {print $$1,$$2}} \ -# /^[-${LCH}]{3}/[${SUFF}]/ {if ($$2>5) {print $$1,$$2}} \ -# /^[-${LCH}]{4}/[${SUFF}]/ {if ($$2>3) {print $$1,$$2}} \ -# /^[-${LCH}]{5,7}/[${SUFF}]/ {if ($$2>1) {print $$1,$$2}} \ -# /^[-${LCH}]{8,}/[${SUFF}]/ {if ($$2>=0) {print $$1,$$2}} \ -# /^[${UCH}][-${CH}]{1,2}/[${SUFF}]/ {if ($$2>4) {print $$1,$$2}} \ -# /^[${UCH}][-${CH}]{3}/[${SUFF}]/ {if ($$2>3) {print $$1,$$2}} \ -# /^[${UCH}][-${CH}]{4}/[${SUFF}]/ {if ($$2>2) {print $$1,$$2}} \ -# /^[${UCH}][-${CH}]{5,7}/[${SUFF}]/ {if ($$2>1) {print $$1,$$2}} \ -# /^[${UCH}][-${CH}]{8,}/[${SUFF}]/ {if ($$2>=0) {print $$1,$$2}}' +# /^[=${LCH}]{1,2}/[${SUFF}]/ {if ($$2>6) {print $$1,$$2}} \ +# /^[=${LCH}]{3}/[${SUFF}]/ {if ($$2>5) {print $$1,$$2}} \ +# /^[=${LCH}]{4}/[${SUFF}]/ {if ($$2>3) {print $$1,$$2}} \ +# /^[=${LCH}]{5,7}/[${SUFF}]/ {if ($$2>1) {print $$1,$$2}} \ +# /^[=${LCH}]{8,}/[${SUFF}]/ {if ($$2>=0) {print $$1,$$2}} \ +# /^[${UCH}][=${CH}]{1,2}/[${SUFF}]/ {if ($$2>4) {print $$1,$$2}} \ +# /^[${UCH}][=${CH}]{3}/[${SUFF}]/ {if ($$2>3) {print $$1,$$2}} \ +# /^[${UCH}][=${CH}]{4}/[${SUFF}]/ {if ($$2>2) {print $$1,$$2}} \ +# /^[${UCH}][=${CH}]{5,7}/[${SUFF}]/ {if ($$2>1) {print $$1,$$2}} \ +# /^[${UCH}][=${CH}]{8,}/[${SUFF}]/ {if ($$2>=0) {print $$1,$$2}}' # endef
# define CHOOSEROOTB -# '/^[-${LCH}]{1,2}// {if ($$2>8) {print $$1,$$2}} \ -# /^[-${LCH}]{3}// {if ($$2>6) {print $$1,$$2}} \ -# /^[-${LCH}]{4}// {if ($$2>5) {print $$1,$$2}} \ -# /^[-${LCH}]{5,7}// {if ($$2>2) {print $$1,$$2}} \ -# /^[-${LCH}]{8,}// {if ($$2>1) {print $$1,$$2}} \ -# /^[${UCH}][-${CH}]{1,2}// {if ($$2>8) {print $$1,$$2}} \ -# /^[${UCH}][-${CH}]{3}// {if ($$2>6) {print $$1,$$2}} \ -# /^[${UCH}][-${CH}]{4}// {if ($$2>3) {print $$1,$$2}} \ -# /^[${UCH}][-${CH}]{5,7}// {if ($$2>2) {print $$1,$$2}} \ -# /^[${UCH}][-${CH}]{8,}// {if ($$2>1) {print $$1,$$2}}' +# '/^[=${LCH}]{1,2}// {if ($$2>8) {print $$1,$$2}} \ +# /^[=${LCH}]{3}// {if ($$2>6) {print $$1,$$2}} \ +# /^[=${LCH}]{4}// {if ($$2>5) {print $$1,$$2}} \ +# /^[=${LCH}]{5,7}// {if ($$2>2) {print $$1,$$2}} \ +# /^[=${LCH}]{8,}// {if ($$2>1) {print $$1,$$2}} \ +# /^[${UCH}][=${CH}]{1,2}// {if ($$2>8) {print $$1,$$2}} \ +# /^[${UCH}][=${CH}]{3}// {if ($$2>6) {print $$1,$$2}} \ +# /^[${UCH}][=${CH}]{4}// {if ($$2>3) {print $$1,$$2}} \ +# /^[${UCH}][=${CH}]{5,7}// {if ($$2>2) {print $$1,$$2}} \ +# /^[${UCH}][=${CH}]{8,}// {if ($$2>1) {print $$1,$$2}}' # endef
@@ -413,21 +413,21 @@ munched.%: ${LANGUAGE}.words nb.aff.munc -e 's/(er/.*I.*)V/\1/' \ -e 's/(e/.*B.*)W/\1/' \ -e 's/([^ei]um/.*B.*)I/\1/' \ - | $(SED) -e N -e 's/^(([-${CH}])*([^e][^r]|[e][^r]|[r][^e]))/([A-Zt-z]*)\n\1e/([A-Zt-z]*)R([A-Zt-z]*)$$/\1/\4*\1e/\5\6/g' \ + | $(SED) -e N -e 's/^(([=${CH}])*([^e][^r]|[e][^r]|[r][^e]))/([A-Zt-z]*)\n\1e/([A-Zt-z]*)R([A-Zt-z]*)$$/\1/\4*\1e/\5\6/g' \ -e '$$ p' -e '$$ d' -e P -e D \ | tr '*' '\n' \ - | $(SED) -e N -e 's/^(([-${CH}])*)(/[AB]*)E(.*)\n\1er/AI/\1\3\4*\1er/AI/' \ + | $(SED) -e N -e 's/^(([=${CH}])*)(/[AB]*)E(.*)\n\1er/AI/\1\3\4*\1er/AI/' \ -e '$$ p' -e '$$ d' -e P -e D \ | tr '*' '\n' \ | $(SED) -e '$(STREKSUBSTSED)' \ -e 's//([${SUFF}]*)([${PRE}]*)//\2\1/' \ - -e 's/(([-${CH}])*)/([${PRE}]*)([${SUFF}]+)$$/\1/\3*\1/\3\4/' \ - -e 's/^([-${CH}]*)$$/\1/ /' \ + -e 's/(([=${CH}])*)/([${PRE}]*)([${SUFF}]+)$$/\1/\3*\1/\3\4/' \ + -e 's/^([=${CH}]*)$$/\1/ /' \ | tr '*' '\n' \ | $(SED) -e ':START' \ - -e 's/^([-${CH}]+)/([${PRE}]*)([${SUFF}]+)([${SUFF}])/\1/\2\3*\1/\2\4/' \ + -e 's/^([=${CH}]+)/([${PRE}]*)([${SUFF}]+)([${SUFF}])/\1/\2\3*\1/\2\4/' \ -e 't START' \ - -e 's/^([-${CH}]+)/([${PRE}]+)(*|$$)/\1/*\1/\2\3/'\ + -e 's/^([=${CH}]+)/([${PRE}]+)(*|$$)/\1/*\1/\2\3/'\ | tr '*' '\n' > munch2.tmp # This pipe produce a file containing the a line number of munch2.tmp and # the frequency indicator for that line. Note that the summation rule @@ -435,7 +435,7 @@ munched.%: ${LANGUAGE}.words nb.aff.munc cat munch2.tmp \ | tr -d ' ' \ | ispell -e -d ./nb.munch.hash \ - | $(SED) -e 's/^[-${CH}]+ //' -e '$(STREKSUBSTSED)' \ + | $(SED) -e 's/^[=${CH}]+ //' -e '$(STREKSUBSTSED)' \ | $(AWK) --source '{i=0; while (i<NF) {i=i+1;print $$i,NR}}' \ | sort \ | join - ${LANGUAGE}.words \ @@ -465,8 +465,8 @@ munched.%: ${LANGUAGE}.words nb.aff.munc | uniq \ | tr -d ' ' \ | $(SED) -e '$$ p' -e '$$ D' -e ':START' -e '$$ ! 
N' \ - -e 's/^(([-${CH}])+)/([0-9]*)\n\1/([${SUFF}${PRE}0-9]*)$$/\1/\3\4/' \ - -e 's/^(([-${CH}])+/)([0-9]*)([${PRE}]*)([${SUFF}0-9]*)\n\1\4([${SUFF}0-9]+)$$/\1\3\4\5\6/' \ + -e 's/^(([=${CH}])+)/([0-9]*)\n\1/([${SUFF}${PRE}0-9]*)$$/\1/\3\4/' \ + -e 's/^(([=${CH}])+/)([0-9]*)([${PRE}]*)([${SUFF}0-9]*)\n\1\4([${SUFF}0-9]+)$$/\1\3\4\5\6/' \ -e 't START' -e P -e D \ | $(SED) -e 's//([${SUFF}0-9${PRE}]*)//\1* \1/' \ | tr '*' '\n' \ @@ -495,7 +495,7 @@ nb.mch: forkort.txt $(patsubst %,munched
# First make a list of words with some compound flag, and a hash-file. cat forkort.txt $(patsubst %,munched.%,${CATHEGORIES}) \ - | tr -d '-0-9 ' \ + | tr -d '=0-9 ' \ | grep "/.*[z\_`]" \ > comp1.tmp $(BUILDHASH) comp1.tmp nb.aff comp.hash @@ -508,13 +508,13 @@ nb.mch: forkort.txt $(patsubst %,munched
cat -n forkort.txt $(patsubst %,munched.%,${CATHEGORIES}) \ | grep -v "/.*[z\_`]" \ - | $(AWK) --source '/-/ {if ($$3<${COMPOUNDLIMIT}) {print $$1,$$2,$$3}}' \ + | $(AWK) --source '/=/ {if ($$3<${COMPOUNDLIMIT}) {print $$1,$$2,$$3}}' \ > comp2.tmp # Test which words are accepted by ispell. Output is a list of line # numbers indicating the lines that can be removed from the munched # file. cat comp2.tmp \ - | tr -d '-0-9 ' \ + | tr -d '=0-9 ' \ | ispell -e -d ./comp.hash \ | $(SED) -e 's/$$/ xyxyxyxy/' \ | ispell -l -d ./comp.hash \ @@ -531,7 +531,7 @@ nb.mch: forkort.txt $(patsubst %,munched cat -n forkort.txt $(patsubst %,munched.%,${CATHEGORIES}) \ | sort -n -m -s +0 -1 comp3.tmp - \ | $(SED) -e '/^[0-9]+$$/,/.*/ D' -e '/(xxxx|yyyy)// D' \ - | tr -d '- 0-9' \ + | tr -d '= 0-9' \ | LC_COLLATE=C sort > $@ rm -f comp.hash comp[123].tmp*
@@ -551,9 +551,9 @@ nn.mch: ${LANGUAGE}.words nn.aff.munch ${CATNOHEADER} \ | grep '*' \ | $(SED) -e 's/ .*//' \ - | tr -d '-' \ + | tr -d '=' \ | munchlist -v -l nn.aff.munch \ - | $(SED) -e N -e 's/^(([-${CH}])*)er/(.*F.*)\n\1rar/M$$/\1er/\3D/' \ + | $(SED) -e N -e 's/^(([=${CH}])*)er/(.*F.*)\n\1rar/M$$/\1er/\3D/' \ -e '$$ p' -e '$$ d' -e P -e D \ | LC_COLLATE=C sort > $@
@@ -566,7 +566,7 @@ words.nb: ${LANGUAGE}.words | grep '[BANDS]$$' \ | tr -d '*' \ | $(AWK) --re-interval --source ${WORDSFILTER} \ - | tr -d '"-' \ + | tr -d '"=' \ | grep -v '(xxxx|yyyy|zyzyzy)' \ | sort -f \ > $@ @@ -577,7 +577,7 @@ words.nn: ${LANGUAGE}.words ${CATNOHEADER} \ | grep '*' \ | $(SED) -e 's/ .*//' \ - | tr -d '"-' \ + | tr -d '"=' \ | grep -v '(xxxx|yyyy|zyzyzy)' \ | sort -f \ > $@ @@ -588,7 +588,7 @@ words.${LANGUAGE}.%: ${LANGUAGE}.words | grep '[BANDS]$$' \ | grep ' $(patsubst words.${LANGUAGE}.%,%,$@) ' \ | $(SED) -e 's/ .*//' \ - | tr -d - \ + | tr -d = \ | grep -v '(xxxx|yyyy|zyzyzy)' \ | sort -f \ > $@