Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Tomaž Erjavec
siIUS
Commits
4bf4721a
Commit
4bf4721a
authored
Feb 07, 2020
by
Tomaž Erjavec
Browse files
Add linguistic processing.
parent
1b064ac1
Changes
4
Hide whitespace changes
Inline
Side-by-side
.gitignore
View file @
4bf4721a
GitGroup
nohup.*
*~
*.gz
*.zip
*.tar
*.diff
*.log
tmp
Makefile
View file @
4bf4721a
T
=
/usr/bin/python3 /usr/local/reldi/reldi-tokeniser/tokeniser.py sl
tok
:
$T
< CLARIN/CPZ.txt
>
CLARIN/CPZ.tok
$T
< CLARIN/ODZ.txt
>
CLARIN/ODZ.tok
$T
< CLARIN/SlPr1917.txt
>
CLARIN/SlPr1917.tok
$T
< CLARIN/SlPr1920.txt
>
CLARIN/SlPr1920.tok
$T
< CLARIN/CPZ.txt
>
CLARIN/CPZ.tok
$T
< CLARIN/UstVol.txt
>
CLARIN/UstVol.tok
$T
< CLARIN/ZKP.txt
>
CLARIN/ZKP.tok
$T
< CLARIN/ZKP1929.txt
>
CLARIN/ZKP1929.tok
testc
:
test-tei
:
bin/conllu2tei.pl CLARIN/CPZ.dep < CLARIN/CPZ.xml
>
CLARIN/CPZ.ana.xml
$j
schema/tei_clarin.rng CLARIN/CPZ.ana.xml
test-ana
:
time
totalepa.pl
-a
tok < CLARIN/CPZ.txt
>
CLARIN/CPZ.tok
time
totalepa.pl
-a
pos < CLARIN/CPZ.txt
>
CLARIN/CPZ.tag
time
totalepa.pl
-a
lem < CLARIN/CPZ.txt
>
CLARIN/CPZ.lem
time
totalepa.pl
-a
dep < CLARIN/CPZ.txt
>
CLARIN/CPZ.dep
test-cnv
:
$s
-xsl
:bin/tei2ana.xsl DARIAH/CPZ.xml
>
CLARIN/CPZ.xml
$j
schema/tei_clarin.rng CLARIN/CPZ.xml
bug
:
bin/totalepa.pl
-l
sr
-a
dep < CLARIN/CPZ.txt
>
CLARIN/CPZ.sr.dep
all
:
get move val-dariah cnv val text tok
nohup
:
nohup time
make all
>
nohup.all &
all
:
cnv text ana tei val-ana
xall
:
get move val-dariah cnv val-text text ana tei val-ana
val-ana
:
$j
schema/tei_clarin.rng CLARIN/
*
.ana.xml
tei
:
bin/conllu2tei.pl CLARIN/CPZ.conllu < CLARIN/CPZ.xml
>
CLARIN/CPZ.ana.xml
bin/conllu2tei.pl CLARIN/CPZ.conllu < CLARIN/CPZ.xml
>
CLARIN/CPZ.ana.xml
bin/conllu2tei.pl CLARIN/ODZ.conllu < CLARIN/ODZ.xml
>
CLARIN/ODZ.ana.xml
bin/conllu2tei.pl CLARIN/SlPr1917.conllu < CLARIN/SlPr1917.xml
>
CLARIN/SlPr1917.ana.xml
bin/conllu2tei.pl CLARIN/SlPr1920.conllu < CLARIN/SlPr1920.xml
>
CLARIN/SlPr1920.ana.xml
bin/conllu2tei.pl CLARIN/CPZ.conllu < CLARIN/CPZ.xml
>
CLARIN/CPZ.ana.xml
bin/conllu2tei.pl CLARIN/UstVol.conllu < CLARIN/UstVol.xml
>
CLARIN/UstVol.ana.xml
bin/conllu2tei.pl CLARIN/ZKP.conllu < CLARIN/ZKP.xml
>
CLARIN/ZKP.ana.xml
bin/conllu2tei.pl CLARIN/ZKP1929.conllu < CLARIN/ZKP1929.xml
>
CLARIN/ZKP1929.ana.xml
ana
:
bin/totalepa.pl
-a
dep < CLARIN/CPZ.txt
>
CLARIN/CPZ.conllu
bin/totalepa.pl
-a
dep < CLARIN/ODZ.txt
>
CLARIN/ODZ.conllu
bin/totalepa.pl
-a
dep < CLARIN/SlPr1917.txt
>
CLARIN/SlPr1917.conllu
bin/totalepa.pl
-a
dep < CLARIN/SlPr1920.txt
>
CLARIN/SlPr1920.conllu
bin/totalepa.pl
-a
dep < CLARIN/CPZ.txt
>
CLARIN/CPZ.conllu
bin/totalepa.pl
-a
dep < CLARIN/UstVol.txt
>
CLARIN/UstVol.conllu
bin/totalepa.pl
-a
dep < CLARIN/ZKP.txt
>
CLARIN/ZKP.conllu
bin/totalepa.pl
-a
dep < CLARIN/ZKP1929.txt
>
CLARIN/ZKP1929.conllu
text
:
$s
-xsl
:bin/ana2txt.xsl CLARIN/CPZ.xml
>
CLARIN/CPZ.txt
$s
-xsl
:bin/ana2txt.xsl CLARIN/ODZ.xml
>
CLARIN/ODZ.txt
...
...
@@ -24,7 +47,7 @@ text:
$s
-xsl
:bin/ana2txt.xsl CLARIN/UstVol.xml
>
CLARIN/UstVol.txt
$s
-xsl
:bin/ana2txt.xsl CLARIN/ZKP.xml
>
CLARIN/ZKP.txt
$s
-xsl
:bin/ana2txt.xsl CLARIN/ZKP1929.xml
>
CLARIN/ZKP1929.txt
val
:
val
-text
:
$j
schema/tei_clarin.rng CLARIN/
*
.xml
cnv
:
$s
-xsl
:bin/tei2ana.xsl DARIAH/CPZ.xml
>
CLARIN/CPZ.xml
...
...
@@ -53,6 +76,6 @@ get:
cd
GitGroup/ustvol/
;
git pull origin
cd
GitGroup/zkp1890/
;
git pull origin
cd
GitGroup/zkp1929/
;
git pull origin
P
=
parallel
--gnu
--halt
0
--jobs
5
p
=
parallel
--gnu
--halt
0
--jobs
5
j
=
java
-jar
/usr/local/bin/jing.jar
s
=
java
-jar
/usr/local/bin/saxon9he.jar
bin/conllu2tei.pl
0 → 100755
View file @
4bf4721a
#!/usr/bin/perl
# Insert CONLL-U annotated text into source TEI
# It is assumed that <ab> is the only element containing text,
# and that it does not contain mixed content
# Usage:
# conllu2tei.pl <CONLL-U> < <SOURCE-TEI> > <TARGET-TEI>
#
use
warnings
;
use
utf8
;
# All three standard streams carry UTF-8 text.
binmode STDERR, ':utf8';
binmode STDIN,  ':utf8';
binmode STDOUT, ':utf8';

# Words need IDs if they are parsed.
# If an <ab> does not have an ID, then make one up, using this prefix:
$ab_prefix = 'doc';

# Prefixes to use on values, and the type of the UD linkGrp.
# These are package variables because the subroutines further down read them.
$msd_prefix = 'mte';
$ud_prefix  = 'ud-syn';
$ud_type    = 'UD-SYN';

# Read in CONLL-U: the file named by the first command line argument is
# split into one chunk per "# newpar id = " marker, and every chunk that
# contains a TAB is queued in @connlu (the name is shared with the main loop).
$udFile = shift;
die "Usage: conllu2tei.pl <CONLL-U> < <SOURCE-TEI> > <TARGET-TEI>\n"
    unless defined $udFile;
{
    # Keep the changed record separator confined to this reader.
    local $/ = "# newpar id = ";
    open(my $tbl, '<:utf8', $udFile)
        or die "Can't open CONLL-U file $udFile: $!\n";
    while (my $par = <$tbl>) {
        # Newpar is snipped off, so a chunk starts with the newpar id number
        chomp $par;
        # The first chunk (before the first marker) has no tokens: skip it
        push(@connlu, $par) if $par =~ /\t/;
    }
    close $tbl;
}
# Read the source TEI from STDIN (or files named on the command line) in
# records that end with "</ab>". Each record holding an <ab> element is
# paired with the next queued CONLL-U paragraph, sanity-checked against it,
# and printed with the <ab> content replaced by the annotated TEI.
# Records without a complete <ab> (e.g. the tail of the document) are
# copied through unchanged.
$/ = "</ab>";
my $ab_n = 0;    # counter for made-up <ab> IDs
while (<>) {
    if (my ($prefix, $ab) = m|(.*)(<ab[ >].+</ab>)|s) {
        # length(), not truth: a prefix consisting of "0" must still be printed
        print $prefix if length $prefix;
        my ($stag, $text, $etag) = $ab =~ m|(<ab.*?>)(.+?)(</ab>)|s
            or die "WEIRD1: $ab";
        my $ab_id;
        if ($stag =~ m| xml:id="(.+?)"|) {
            $ab_id = $1;
        }
        else {
            $ab_id = $ab_prefix . '.' . ++$ab_n;
            #$stag =~ s| | xml:id="$ab_id" |; #No need to give abs ids
        }
        $text =~ s/\s+/ /gs;    # Will use it for sanity check
        $text =~ s/^ //;

        my $conllu_ab = shift(@connlu);
        die "Out of synch: no CONLL-U paragraph left for <ab> $ab_id\n"
            unless defined $conllu_ab;
        my ($conllu_incipit) = $conllu_ab =~ /\n# text = (.+)\n/
            or die "WEIRD2: $conllu_ab";

        # The CONLL-U side is plain text while $text is raw XML source, so
        # also try the comparison with the predefined entities decoded
        # (&amp; last, so that "&amp;lt;" does not become "<").
        my $plain = $text;
        $plain =~ s/&lt;/</g;
        $plain =~ s/&gt;/>/g;
        $plain =~ s/&quot;/"/g;
        $plain =~ s/&apos;/'/g;
        $plain =~ s/&amp;/&/g;
        die "Out of synch:\n$conllu_incipit\n$text\n"
            unless $text  =~ /^\Q$conllu_incipit\E/
                or $plain =~ /^\Q$conllu_incipit\E/;

        my $teiana_ab = conllu2tei($ab_id, $conllu_ab);
        print "$stag\n$teiana_ab\n$etag";
    }
    elsif (not m|</ab>|) {
        print;
    }
    else {
        die "WIERD3: $_";
    }
}
# Report annotation that was never matched to an <ab>; the output is already
# written at this point, so this is a warning rather than a fatal error.
warn "Unused CONLL-U paragraphs left over: " . scalar(@connlu) . "\n"
    if @connlu;
# Convert one ab into TEI.
# Arguments: the ID of the <ab>, and its CONLL-U paragraph.
# The paragraph is split on blank lines into sentences; every sentence that
# has a "# text = " line is converted with sent2tei() under the ID
# "<ab id>.<sentence number>", the number being the part after the dot in
# its "# sent_id = P.S" line. Dies with WEIRD4 if that line is missing.
# Returns the concatenated sentences with the trailing <c> </c> and
# whitespace removed, or an empty string if no sentence qualified.
sub conllu2tei {
    my $id     = shift;
    my $conllu = shift;
    my $tei    = '';    # start defined, so an empty paragraph gives '' without warnings
    foreach my $sent (split(/\n\n/, $conllu)) {
        next unless $sent =~ /# text = .+\n/;
        my ($sent_n) = $sent =~ /# sent_id = \d+\.(\d+)/
            or die "WEIRD4: $sent";
        # Deliberately left as a package variable (not "my"): it is visible
        # to other code in this file, which may rely on it.
        $sent_id = $id . '.' . $sent_n;
        $tei .= sent2tei($sent_id, $sent);
    }
    $tei =~ s|<c> </c>\s*$||s;
    $tei =~ s|\s+$||;
    return $tei
}
# Convert one sentence into TEI.
# Arguments: the ID to give the <s> element, and the CONLL-U lines of the
# sentence. Only lines starting with a plain integer token number are used.
# Tokens whose XPOS contains "Z" become <pc>, all others <w> (with a lemma
# attribute). If at least one token has a dependency relation, every token
# gets an xml:id ("<s id>.t<n>") and a <linkGrp> with one <link> per
# relation is appended; a head of 0 points at the sentence itself.
# Returns the <s> element followed by a <c> </c> separator.
# Uses the file-level settings $msd_prefix, $ud_prefix and $ud_type.
sub sent2tei {
    my $id     = shift;
    my $conllu = shift;
    my (@ids, @toks, @deps);

    # Use our own argument for the ID rather than a variable set by the caller
    my $tei = "<s xml:id=\"$id\">\n";

    foreach my $line (split(/\n/, $conllu)) {
        next unless $line =~ /^\d+\t/;
        my ($n, $token, $lemma, $upos, $xpos, $ufeats, $link, $role, undef, $local)
            = split /\t/, $line;
        # Short lines (fewer than 10 columns) get the CONLL-U "empty" value
        foreach my $field ($lemma, $upos, $xpos, $ufeats, $link, $role, $local) {
            $field = '_' unless defined $field;
        }

        my $tag = $xpos =~ /Z/ ? 'pc' : 'w';
        #$role =~ s/:/_/; #Leave for now, although backwards incompatibility!

        my $feats = "UposTag=$upos";
        $feats .= "|$ufeats" if $ufeats ne '_';

        # SpaceAfter=No only controls the <c> </c> separator; anything else
        # in the last column is appended to the features. Splitting on "|"
        # avoids an empty "||" item when SpaceAfter=No is not the only entry.
        my $space = $local !~ /SpaceAfter=No/;
        my @misc  = grep { $_ ne '' and $_ ne '_' and $_ ne 'SpaceAfter=No' }
                    split /\|/, $local;
        $feats .= '|' . join('|', @misc) if @misc;

        $token = xml_encode($token);
        $lemma = xml_encode($lemma);
        $lemma =~ s|"|&quot;|g;    # the lemma is written inside "..."

        my $element = $tag eq 'w'
            ? "<$tag ana=\"$msd_prefix:$xpos\" msd=\"$feats\" lemma=\"$lemma\">$token</$tag>"
            : "<$tag ana=\"$msd_prefix:$xpos\" msd=\"$feats\">$token</$tag>";
        $element .= "<c> </c>" if $space;

        push @ids,  $id . '.t' . $n;
        push @toks, $element;
        #Only if we have a parse
        push @deps, "$link\t$n\t$role" if $role ne '_';
    }

    unless (@deps) {
        #No parse
        $tei .= join "\n", @toks;
        # join() leaves no final newline, so anchor directly on the separator
        $tei =~ s|<c> </c>$||;
    }
    else {
        # Parsed
        #Give IDs to tokens as we have a parse
        foreach my $i (0 .. $#ids) {
            my $element = $toks[$i];
            # The first space is the one right after the tag name
            $element =~ s| | xml:id="$ids[$i]" |;
            $tei .= "$element\n";
        }
        $tei =~ s|<c> </c>\n$|\n|;
        $tei .= "<linkGrp type=\"$ud_type\" targFunc=\"head argument\" corresp=\"#$id\">\n";
        foreach my $dep (@deps) {
            my ($head, $arg, $role) = split /\t/, $dep;
            my $head_id = $id;    #if 0 points to sentence id
            $head_id .= '.t' . $head if $head;
            my $arg_id = $id . '.t' . $arg;
            $tei .= "<link ana=\"$ud_prefix:$role\" target=\"#$head_id #$arg_id\"/>\n";
        }
        $tei .= "</linkGrp>";
    }
    $tei .= "\n</s>\n<c> </c>\n";
    return $tei
}
# Escape the XML markup characters &, < and > in a string so that it can be
# written as element content. Takes the string, returns the escaped copy.
# The ampersand is handled first so that the entities produced for < and >
# are not escaped a second time. Double quotes are left alone.
sub xml_encode {
    my $str = shift;
    $str =~ s|&|&amp;|g;
    $str =~ s|<|&lt;|g;
    $str =~ s|>|&gt;|g;
    # $str =~ s|"|&quot;|g;
    return $str
}
bin/totalepa.pl
0 → 100755
View file @
4bf4721a
#!/usr/bin/perl
# Script to tokenise, tag, lemmatise and parse text
# Using ReLDI tokeniser and CLASSLA-StanfordNLP tagger, lemmatiser and parser
# Accepts UTF-8 plain text on STDIN, and outputs CONLL-U on STDOUT
#
# The input is copied to a temporary file, the requested steps are run one
# after the other as external commands (each step reads the file written by
# the previous one), and the file written by the last step is printed.
use strict;
use warnings;
use utf8;
use Getopt::Long;                      #command line option processing
use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory
#use FindBin qw($Bin); #get location of this executable
#use File::Basename;
#my ($filename, $dir) = fileparse($path);

my $DEBUG   = 0;    #Log to STDERR + do not delete tmp files
my $VERSION = "totalepa version 0.1";

my $HELP = <<'EOB';
Usage: totalepa.pl [-a <analysis>] [-l <language>] < TEXT > CONLLU
Tokenise, tag, lemmatise, parse a UTF-8 text standard input to standard output.
-a specifies up to which analysis to perfom, default is 'lem'
<analysis> should be one of 'tok', 'pos', 'lem', 'dep'
-l specifies the language of the input, default is 'sl'
<language> should be one of 'sl', 'hr', 'sr'
Also recognises the following switches:
-h output this text and exit
EOB

# Analyses in the order in which they are run, and the supported languages
my @ok_anals = qw(tok pos lem dep);
my @ok_langs = qw(sl hr sr);

my ($hflag, $anal, $lang);
GetOptions(
    "help"       => \$hflag,
    "language=s" => \$lang,
    "analysis=s" => \$anal,
) or die $HELP;    # unknown or malformed option
if (defined $hflag) {
    print $HELP;
    exit;
}

# Validate by exact comparison; the values come from the command line and
# must not be interpreted as regular expressions.
$lang = 'sl' unless defined $lang;
die "Supported languages are @ok_langs not $lang!\n"
    unless grep { $_ eq $lang } @ok_langs;
$anal = 'lem' unless defined $anal;
die "Supported analyses are @ok_anals not $anal!\n"
    unless grep { $_ eq $anal } @ok_anals;

# Every step up to and including the requested one is run
my %run;
foreach my $step (@ok_anals) {
    $run{$step} = 1;
    last if $step eq $anal;
}

binmode(STDIN,  ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Temporary working directory, removed on exit unless debugging
my $tempdirroot = "/tmp/totalepa";
unless (-d $tempdirroot) {
    # The second -d covers another process creating it in the meantime
    mkdir $tempdirroot or -d $tempdirroot
        or die "Can't make tmp dir: $!!\n";
}
my $CLEANUP = $DEBUG ? 0 : 1;
my $tempdir = tempdir(DIR => $tempdirroot, CLEANUP => $CLEANUP);

my $f_txt = "$tempdir/file.txt";
my $f_tok = "$tempdir/file.conllu";
my $f_pos = "$tempdir/file.pos.conllu";
my $f_lem = "$tempdir/file.pos.lemma.conllu";
my $f_dep = "$tempdir/file.pos.lemma.dep.conllu";

# Per-language model arguments for the three stanfordnlp steps
my %tagger_model = (
    sl => '--save_name ssj500k --shorthand sl_ssj',
    hr => '--save_name hr500k --shorthand hr_set',
    sr => '--save_name SETimes.SR --shorthand sr_set',
);
my %lemmatiser_model = (
    sl => '--model_file ssj500k+Sloleks',
    hr => '--model_file hr500k+hrLex',
    sr => '--model_file SETimes.SR+bsrLex',
);
my %parser_model = (
    sl => '--save_name ssj500k_ud --shorthand sl_ssj',
    hr => '--save_name hr500k_ud --shorthand hr_set',
    sr => '--save_name SETimes.SR --shorthand sr_set',
);
foreach my $models (\%tagger_model, \%lemmatiser_model, \%parser_model) {
    die "No model configured for language $lang!\n"
        unless exists $models->{$lang};
}

my $tokeniser  = "python /usr/local/reldi/reldi-tokeniser/tokeniser.py -c $lang";
my $stanford   = '/usr/local/classla-stanfordnlp';
my $tagger     = 'python -m stanfordnlp.models.tagger --mode predict --save_dir models/pos/ '
               . $tagger_model{$lang};
my $lemmatiser = 'python -m stanfordnlp.models.lemmatizer --mode predict --model_dir models/lemma/ '
               . $lemmatiser_model{$lang};
my $parser     = 'python -m stanfordnlp.models.parser --mode predict --save_dir models/depparse/ '
               . $parser_model{$lang};

#Copy STDIN to TMP file
{
    local $/;    # slurp
    open(my $txt, '>:utf8', $f_txt) or die "Can't open $f_txt!\n";
    print {$txt} <>;
    close $txt or die "Can't write $f_txt: $!\n";
}

my $out;    # file written by the last step that was run

#Tokenise
if ($run{tok}) {
    run_step('tokenising', 'Tokenisation',
             "$tokeniser < $f_txt > $f_tok");
    $out = $f_tok;
}
#PoS tag
# "&&" rather than ";" so that a failed cd is reported instead of running
# the tool in the wrong directory
if ($run{pos}) {
    run_step('PoS tagging', 'PoS tagging',
             "cd $stanford && $tagger --eval_file $f_tok --output_file $f_pos > /dev/null");
    $out = $f_pos;
}
#Lemmatise
if ($run{lem}) {
    run_step('lemmatising', 'Lemmatisation',
             "cd $stanford && $lemmatiser --eval_file $f_pos --output_file $f_lem > /dev/null");
    $out = $f_lem;
}
#Parse
if ($run{dep}) {
    run_step('parsing', 'Parsing',
             "cd $stanford && $parser --eval_file $f_lem --output_file $f_dep > /dev/null");
    $out = $f_dep;
}

#Print to STDOUT
{
    local $/;    # slurp
    open(my $res, '<:utf8', $out) or die "Can't open $out!\n";
    print <$res>;
    close $res;
}

# Run one pipeline step through the shell.
# Arguments: the activity name for the INFO message, the name used in the
# failure message, and the complete shell command.
# Dies with the status returned by system() if the command fails.
sub run_step {
    my ($activity, $failure, $command) = @_;
    print STDERR "INFO: $activity...\n";
    print STDERR "INFO: $command\n" if $DEBUG;
    my $status = system($command);
    die "$failure failed: $status!\n" if $status;
    return;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment