Commit 4bf4721a authored by Tomaž Erjavec's avatar Tomaž Erjavec
Browse files

Add linguistic processing.

parent 1b064ac1
GitGroup
nohup.*
*~
*.gz
*.zip
*.tar
*.diff
*.log
tmp
T = /usr/bin/python3 /usr/local/reldi/reldi-tokeniser/tokeniser.py sl
tok:
$T < CLARIN/CPZ.txt > CLARIN/CPZ.tok
$T < CLARIN/ODZ.txt > CLARIN/ODZ.tok
$T < CLARIN/SlPr1917.txt > CLARIN/SlPr1917.tok
$T < CLARIN/SlPr1920.txt > CLARIN/SlPr1920.tok
$T < CLARIN/CPZ.txt > CLARIN/CPZ.tok
$T < CLARIN/UstVol.txt > CLARIN/UstVol.tok
$T < CLARIN/ZKP.txt > CLARIN/ZKP.tok
$T < CLARIN/ZKP1929.txt > CLARIN/ZKP1929.tok
testc:
test-tei:
bin/conllu2tei.pl CLARIN/CPZ.dep < CLARIN/CPZ.xml > CLARIN/CPZ.ana.xml
$j schema/tei_clarin.rng CLARIN/CPZ.ana.xml
test-ana:
time totalepa.pl -a tok < CLARIN/CPZ.txt > CLARIN/CPZ.tok
time totalepa.pl -a pos < CLARIN/CPZ.txt > CLARIN/CPZ.tag
time totalepa.pl -a lem < CLARIN/CPZ.txt > CLARIN/CPZ.lem
time totalepa.pl -a dep < CLARIN/CPZ.txt > CLARIN/CPZ.dep
test-cnv:
$s -xsl:bin/tei2ana.xsl DARIAH/CPZ.xml > CLARIN/CPZ.xml
$j schema/tei_clarin.rng CLARIN/CPZ.xml
bug:
bin/totalepa.pl -l sr -a dep < CLARIN/CPZ.txt > CLARIN/CPZ.sr.dep
all: get move val-dariah cnv val text tok
nohup:
nohup time make all > nohup.all &
all: cnv text ana tei val-ana
xall: get move val-dariah cnv val-text text ana tei val-ana
val-ana:
$j schema/tei_clarin.rng CLARIN/*.ana.xml
# tei: for each text, feed CLARIN/<name>.conllu and CLARIN/<name>.xml to
# bin/conllu2tei.pl and write the result to CLARIN/<name>.ana.xml.
# NOTE(review): the CPZ conversion appears three times in this recipe; the
# repeats overwrite the same output file -- confirm whether one of them was
# meant to name a different text, or whether they can simply be dropped.
tei:
bin/conllu2tei.pl CLARIN/CPZ.conllu < CLARIN/CPZ.xml > CLARIN/CPZ.ana.xml
bin/conllu2tei.pl CLARIN/CPZ.conllu < CLARIN/CPZ.xml > CLARIN/CPZ.ana.xml
bin/conllu2tei.pl CLARIN/ODZ.conllu < CLARIN/ODZ.xml > CLARIN/ODZ.ana.xml
bin/conllu2tei.pl CLARIN/SlPr1917.conllu < CLARIN/SlPr1917.xml > CLARIN/SlPr1917.ana.xml
bin/conllu2tei.pl CLARIN/SlPr1920.conllu < CLARIN/SlPr1920.xml > CLARIN/SlPr1920.ana.xml
bin/conllu2tei.pl CLARIN/CPZ.conllu < CLARIN/CPZ.xml > CLARIN/CPZ.ana.xml
bin/conllu2tei.pl CLARIN/UstVol.conllu < CLARIN/UstVol.xml > CLARIN/UstVol.ana.xml
bin/conllu2tei.pl CLARIN/ZKP.conllu < CLARIN/ZKP.xml > CLARIN/ZKP.ana.xml
bin/conllu2tei.pl CLARIN/ZKP1929.conllu < CLARIN/ZKP1929.xml > CLARIN/ZKP1929.ana.xml
# ana: run bin/totalepa.pl with "-a dep" (analysis up to dependency parsing)
# on each CLARIN/<name>.txt and store the output as CLARIN/<name>.conllu.
# NOTE(review): CLARIN/CPZ.txt is processed twice in this recipe; the second
# run overwrites the first -- confirm whether another text was intended.
ana:
bin/totalepa.pl -a dep < CLARIN/CPZ.txt > CLARIN/CPZ.conllu
bin/totalepa.pl -a dep < CLARIN/ODZ.txt > CLARIN/ODZ.conllu
bin/totalepa.pl -a dep < CLARIN/SlPr1917.txt > CLARIN/SlPr1917.conllu
bin/totalepa.pl -a dep < CLARIN/SlPr1920.txt > CLARIN/SlPr1920.conllu
bin/totalepa.pl -a dep < CLARIN/CPZ.txt > CLARIN/CPZ.conllu
bin/totalepa.pl -a dep < CLARIN/UstVol.txt > CLARIN/UstVol.conllu
bin/totalepa.pl -a dep < CLARIN/ZKP.txt > CLARIN/ZKP.conllu
bin/totalepa.pl -a dep < CLARIN/ZKP1929.txt > CLARIN/ZKP1929.conllu
text:
$s -xsl:bin/ana2txt.xsl CLARIN/CPZ.xml > CLARIN/CPZ.txt
$s -xsl:bin/ana2txt.xsl CLARIN/ODZ.xml > CLARIN/ODZ.txt
......@@ -24,7 +47,7 @@ text:
$s -xsl:bin/ana2txt.xsl CLARIN/UstVol.xml > CLARIN/UstVol.txt
$s -xsl:bin/ana2txt.xsl CLARIN/ZKP.xml > CLARIN/ZKP.txt
$s -xsl:bin/ana2txt.xsl CLARIN/ZKP1929.xml > CLARIN/ZKP1929.txt
val:
val-text:
$j schema/tei_clarin.rng CLARIN/*.xml
cnv:
$s -xsl:bin/tei2ana.xsl DARIAH/CPZ.xml > CLARIN/CPZ.xml
......@@ -53,6 +76,6 @@ get:
cd GitGroup/ustvol/; git pull origin
cd GitGroup/zkp1890/; git pull origin
cd GitGroup/zkp1929/; git pull origin
P = parallel --gnu --halt 0 --jobs 5
p = parallel --gnu --halt 0 --jobs 5
j = java -jar /usr/local/bin/jing.jar
s = java -jar /usr/local/bin/saxon9he.jar
#!/usr/bin/perl
# Insert CONLL-U annotated text into source TEI
# It is assumed that <ab> is the only element containing text,
# and that it does not contain mixed content
# Usage:
# conllu2tei.pl <CONLL-U> < <SOURCE-TEI> > <TARGET-TEI>
#
use warnings;
use utf8;
binmode STDERR, 'utf8';
binmode STDIN, 'utf8';
binmode STDOUT, 'utf8';
#Words need IDs if they are parsed
#If ab does not have an ID, then make it up, with this prefix:
$ab_prefix = 'doc';
#Prefixes to use on values, and the type of the UD linkGrp
#(kept as package globals because sent2tei reads them)
$msd_prefix = 'mte';
$ud_prefix = 'ud-syn';
$ud_type = 'UD-SYN';

# Read in CONLL-U, one record per paragraph
my $udFile = shift @ARGV
    or die "Usage: conllu2tei.pl <CONLL-U> < <SOURCE-TEI> > <TARGET-TEI>\n";
open my $tbl, '<:utf8', $udFile or die "Can't open $udFile: $!\n";
my @conllu_pars;
$/ = "# newpar id = ";
while (my $par = <$tbl>) {
    chomp $par; #Newpar is snipped off, a record starts with newpar_id number
    push(@conllu_pars, $par) if $par =~ /\t/; #First one will be empty, so check if \t
}
close $tbl;

#Predefined XML entities, decoded only for the sanity check below
my %char_for = (lt => '<', gt => '>', quot => '"', apos => "'", amp => '&');

#Read in one ab per record from source TEI
$/ = "</ab>";
my $ab_n = 0;
while (<>) {
    if (my ($prefix, $ab) = m|(.*)(<ab[ >].+</ab>)|s) {
        #Everything before the ab is copied through unchanged
        print $prefix;
        my ($stag, $text, $etag) = $ab =~ m|(<ab.*?>)(.+?)(</ab>)|s
            or die "WEIRD1: $ab";
        my $ab_id;
        if ($stag =~ m| xml:id="(.+?)"|) {$ab_id = $1}
        else {
            $ab_id = $ab_prefix . '.' . ++$ab_n;
            #$stag =~ s| | xml:id="$ab_id" |; #No need to give abs ids
        }
        $text =~ s/\s+/ /gs; # Will use it for sanity check
        $text =~ s/^ //;
        #The CONLL-U text is presumably derived from plain text, where
        #entities such as &amp; are decoded, so compare against that form too
        (my $plain = $text) =~ s/&(lt|gt|quot|apos|amp);/$char_for{$1}/g;
        my $conllu_ab = shift @conllu_pars;
        die "Out of synch: no CONLL-U paragraph left for ab $ab_id\n"
            unless defined $conllu_ab;
        my ($conllu_incipit) = $conllu_ab =~ /\n# text = (.+)\n/
            or die "WEIRD2: $conllu_ab";
        #The first sentence of the CONLL-U paragraph must start the ab text
        die "Out of synch:\n$conllu_incipit\n$text\n"
            unless $text =~ /^\Q$conllu_incipit\E/
                or $plain =~ /^\Q$conllu_incipit\E/;
        my $teiana_ab = conllu2tei($ab_id, $conllu_ab);
        print "$stag\n$teiana_ab\n$etag";
    }
    #Trailing material after the last ab
    elsif (not m|</ab>|) {print}
    else {die "WEIRD3: $_"}
}
warn "WARNING: " . scalar(@conllu_pars) . " CONLL-U paragraph(s) left unused\n"
    if @conllu_pars;
#Convert the CONLL-U of one ab (paragraph) into TEI
# Takes the ID of the ab and its CONLL-U record; sentences in the record are
# separated by empty lines. Returns the concatenated TEI of all sentences,
# with the trailing space element and trailing whitespace removed.
sub conllu2tei {
    my ($id, $conllu) = @_;
    my $tei;
    for my $chunk (split /\n\n/, $conllu) {
        #Only chunks that carry a sentence text are sentences
        next if $chunk !~ /# text = .+\n/;
        #The sentence number is the part after the dot in sent_id
        if ($chunk =~ /# sent_id = \d+\.(\d+)/) {
            #Left as a package global, since sent2tei may read it
            $sent_id = "$id.$1";
        }
        else {
            die "WEIRD4: $chunk";
        }
        $tei .= sent2tei($sent_id, $chunk);
    }
    #No space element after the last sentence of the ab
    $tei =~ s|<c> </c>\s*$||s;
    $tei =~ s|\s+$||;
    return $tei;
}
#Convert one sentence into TEI
# $id     - ID of the sentence; token IDs are made as "$id.t<N>"
# $conllu - the CONLL-U lines of the sentence (non-token lines are skipped)
# Returns the <s> element followed by a "<c> </c>" space element.
# If at least one token has a dependency relation the sentence counts as
# parsed: tokens then get xml:id and a linkGrp with the relations is added.
# Reads the globals $msd_prefix, $ud_prefix and $ud_type.
sub sent2tei {
    my $id = shift;
    my $conllu = shift;
    my @ids = ();
    my @toks = ();
    my @deps = ();
    #Use the ID passed in, rather than relying on the caller's $sent_id
    my $tei = "<s xml:id=\"$id\">\n";
    foreach my $line (split(/\n/, $conllu)) {
        #Token lines start with an integer ID followed by a tab
        next unless $line =~ /^\d+\t/;
        my ($n, $token, $lemma, $upos, $xpos, $ufeats, $link, $role, $extra, $local)
            = split /\t/, $line;
        #split drops a trailing empty column
        $local = '_' unless defined $local;
        #XPOS containing Z is marked up as punctuation, all else as word
        my $tag = $xpos =~ /Z/ ? 'pc' : 'w';
        #$role =~ s/:/_/; #Leave for now, although backwards incompatibility!
        my $feats = "UposTag=$upos";
        $feats .= "|$ufeats" if $ufeats ne '_';
        #SpaceAfter=No controls the space element and is not kept as a
        #feature; the remaining MISC items are appended without stray bars
        my @misc = grep { $_ ne '' and $_ ne '_' } split /\|/, $local;
        my $no_space = grep { $_ eq 'SpaceAfter=No' } @misc;
        @misc = grep { $_ ne 'SpaceAfter=No' } @misc;
        $feats .= '|' . join('|', @misc) if @misc;
        $token = xml_encode($token);
        $lemma = xml_encode($lemma);
        $feats = xml_encode($feats);
        #xml_encode does not touch double quotes, which would end an
        #attribute value prematurely
        s/"/&quot;/g for $lemma, $feats;
        my $element = "<$tag ana=\"$msd_prefix:$xpos\" msd=\"$feats\"";
        $element .= " lemma=\"$lemma\"" if $tag eq 'w'; #No lemma on pc
        $element .= ">$token</$tag>";
        $element .= "<c> </c>" unless $no_space;
        push @ids, $id . '.t' . $n;
        push @toks, $element;
        push @deps, "$link\t$n\t$role" #Only if we have a parse
            if $role ne '_';
    }
    if (@deps) { # Parsed
        #Give IDs to tokens as we have a parse
        foreach my $i (0 .. $#toks) {
            (my $element = $toks[$i]) =~ s| | xml:id="$ids[$i]" |;
            $tei .= "$element\n";
        }
        #No space element after the last token of the sentence
        $tei =~ s|<c> </c>\n$|\n|;
        $tei .= "<linkGrp type=\"$ud_type\" targFunc=\"head argument\" corresp=\"#$id\">\n";
        foreach my $dep (@deps) {
            my ($head, $arg, $role) = split /\t/, $dep;
            my $head_id = $id; #if 0 points to sentence id
            $head_id .= '.t' . $head if $head;
            my $arg_id = $id . '.t' . $arg;
            $tei .= " <link ana=\"$ud_prefix:$role\" target=\"#$head_id #$arg_id\"/>\n";
        }
        $tei .= "</linkGrp>";
    }
    else { #No parse
        $tei .= join "\n", @toks;
        #The joined tokens have no final newline, so anchor directly at the end
        $tei =~ s|<c> </c>$||;
    }
    $tei .= "\n</s>\n<c> </c>\n";
    return $tei;
}
#Escape the XML markup characters &, < and > in a string
# Returns the escaped copy; the argument itself is not modified.
# Double quotes are left as they are (the &quot; replacement is disabled).
sub xml_encode {
    my ($text) = @_;
    my %entity_for = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;');
    #One pass, so the ampersands of inserted entities are not escaped again
    $text =~ s/([&<>])/$entity_for{$1}/g;
    # $text =~ s|"|&quot;|g;
    return $text;
}
#!/usr/bin/perl
# Script to tokenise, tag, lemmatise and parse text
# Using ReLDI tokeniser and CLASSLA-StanfordNLP tagger, lemmatiser and parser
# Accepts UTF-8 plain text on STDIN, and outputs CONLL-U on STDOUT
#
# The text is copied to a temporary directory under /tmp/totalepa, each
# requested analysis step is run as an external command that reads the
# output file of the previous step, and the file of the last step that was
# run is printed to STDOUT.
use strict;
use warnings;
use utf8;
use Getopt::Long; #command line option processing
use File::Temp qw/ tempfile tempdir /; #creation of tmp files and directory
#use FindBin qw($Bin); #get location of this executable
#use File::Basename;
#my ($filename, $dir) = fileparse($path);

my $DEBUG = 0; #Log to STDERR + do not delete tmp files
my $VERSION = "totalepa version 0.1";

my $HELP =
<<'EOB';
Usage: totalepa.pl [-a <analysis>] [-l <language>] < TEXT > CONLLU
Tokenise, tag, lemmatise, parse a UTF-8 text standard input to standard output.
-a specifies up to which analysis to perfom, default is 'lem'
<analysis> should be one of 'tok', 'pos', 'lem', 'dep'
-l specifies the language of the input, default is 'sl'
<language> should be one of 'sl', 'hr', 'sr'
Also recognises the following switches:
-h output this text and exit
EOB

#Used in the error messages only
my $ok_anals = " tok pos lem dep ";
my $ok_langs = " sl hr sr ";

#Analyses in pipeline order; a higher rank includes all the lower ones
my %rank_of = (tok => 1, pos => 2, lem => 3, dep => 4);

#Language dependent model arguments of the three stanfordnlp steps
my %tagger_args = (
    sl => '--save_name ssj500k --shorthand sl_ssj',
    hr => '--save_name hr500k --shorthand hr_set',
    sr => '--save_name SETimes.SR --shorthand sr_set',
);
my %lemmatiser_args = (
    sl => '--model_file ssj500k+Sloleks',
    hr => '--model_file hr500k+hrLex',
    sr => '--model_file SETimes.SR+bsrLex',
);
my %parser_args = (
    sl => '--save_name ssj500k_ud --shorthand sl_ssj',
    hr => '--save_name hr500k_ud --shorthand hr_set',
    sr => '--save_name SETimes.SR --shorthand sr_set',
);

my ($hflag, $anal, $lang);
#GetOptions has already warned about the offending option when it fails
GetOptions("help"       => \$hflag,
           "language=s" => \$lang,
           "analysis=s" => \$anal,
          ) or die $HELP;
if (defined $hflag) {
    print $HELP;
    exit
}

#Validate by exact lookup: the values end up in shell commands, and a
#pattern match on the raw option would let regex metacharacters through
$lang = 'sl' unless defined $lang;
die "Supported languages are $ok_langs not $lang!\n"
    unless exists $tagger_args{$lang};
$anal = 'lem' unless defined $anal;
die "Supported analyses are $ok_anals not $anal!\n"
    unless exists $rank_of{$anal};

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

#Temporary working directory, removed on exit unless debugging
my $tempdirroot = "/tmp/totalepa";
mkdir $tempdirroot unless -d $tempdirroot;
die "Can't make tmp dir $tempdirroot: $!\n" unless -d $tempdirroot;
my $tempdir = tempdir(DIR => $tempdirroot, CLEANUP => $DEBUG ? 0 : 1);
my $f_txt = "$tempdir/file.txt";
my $f_tok = "$tempdir/file.conllu";
my $f_pos = "$tempdir/file.pos.conllu";
my $f_lem = "$tempdir/file.pos.lemma.conllu";
my $f_dep = "$tempdir/file.pos.lemma.dep.conllu";

my $tokeniser = "python /usr/local/reldi/reldi-tokeniser/tokeniser.py -c $lang ";
my $stanford = '/usr/local/classla-stanfordnlp';
my $tagger = 'python -m stanfordnlp.models.tagger --mode predict --save_dir models/pos/ '
    . $tagger_args{$lang};
my $lemmatiser = 'python -m stanfordnlp.models.lemmatizer --mode predict --model_dir models/lemma/ '
    . $lemmatiser_args{$lang};
my $parser = 'python -m stanfordnlp.models.parser --mode predict --save_dir models/depparse/ '
    . $parser_args{$lang};

#Copy STDIN to TMP file
{
    local $/; #Read the input in one go
    open(my $txt, '>:utf8', $f_txt) or die "Can't open $f_txt!\n";
    print {$txt} <>;
    close $txt or die "Can't write $f_txt: $!\n";
}

#Tokenise (part of every analysis)
print STDERR "INFO: tokenising...\n";
my $status = system("$tokeniser < $f_txt > $f_tok");
die "Tokenisation failed: $status!\n" if $status;
my $out = $f_tok;

#PoS tag, lemmatise and parse, as far as requested
my @stages = (
    #analysis, progress text, failure text, command, input, output
    ['pos', 'PoS tagging', 'PoS tagging',   $tagger,     $f_tok, $f_pos],
    ['lem', 'lemmatising', 'Lemmatisation', $lemmatiser, $f_pos, $f_lem],
    ['dep', 'parsing',     'Parsing',       $parser,     $f_lem, $f_dep],
);
foreach my $stage (@stages) {
    my ($name, $doing, $label, $command, $in, $result) = @$stage;
    last if $rank_of{$name} > $rank_of{$anal};
    print STDERR "INFO: $doing...\n";
    #The model paths are relative, so do not run if the cd fails
    my $prog = "cd $stanford && $command --eval_file $in --output_file $result";
    print STDERR "INFO: $prog\n" if $DEBUG;
    $status = system("$prog > /dev/null");
    die "$label failed: $status!\n" if $status;
    $out = $result;
}

#Print to STDOUT
open(my $res, '<:utf8', $out) or die "Can't open $out!\n";
{
    local $/;
    print <$res>;
}
close $res;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment