# Eelco Mossel, March 2008.
#
# First argument: file name of the input file; the input file is read as UTF-8.
# Output is written to STDOUT, also as UTF-8.
#
# This script generates a reduced/simplified version of the LT4eL lexicons, retaining all information
# that is used by the LT4eL services and by the ontology navigation feature in ILIAS.
#
# NOTE(review): the stored copy of this file had lost its line breaks and every angle-bracketed
# token (tag names in this comment, the XML of the example, parts of the regexes, the readline
# operator). The tag names <entry>, <owl:...>/<rdf:...>, <def> and <termg> used below are
# reconstructed from the surviving fragments -- confirm them against a pristine copy of the script.
#
# For each <entry> element, only the first <owl:...> or <rdf:...> element within an entry is preserved.
# The children are removed from it. All following elements up to (not including) the first <def>
# element are removed as well.
# Starting from the first <def> element, the rest of the entry is left unchanged.
#
# The script does not parse XML; rather, it assumes a certain layout,
# with respect to line breaks and order of elements:
# If the <def> or <termg> element are on the same line as the first <owl:...> or <rdf:...> element,
# they will be missing from the output.
#
# Input elements, example (the XML markup of this example was lost; only the text content remains):
#
#
#
#
#
#   Any Entity that cannot be located in space-time. E.g. mathematical entities: formal semantics elements, regions within dimensional spaces, etc.Astratto
#
#
#   Any Entity that cannot be located in space-time. E.g. mathematical entities: formal semantics elements, regions within dimensional spaces, etc.
#
#   Entwurf
#   Abstrakte
#
#
#
#
# Output for this example (markup likewise lost):
#
#
#   Any Entity that cannot be located in space-time. E.g. mathematical entities: formal semantics elements, regions within dimensional spaces, etc.
#
#   Entwurf
#   Abstrakte
#
#
#

use strict;
use warnings;

# Opening tag of a lexicon entry, including leading whitespace.
# NOTE(review): the tag name "entry" was stripped from the stored source and is
# reconstructed from the header comment -- confirm.
my $ENTRY_OPEN_RE = qr/\s*<entry.*?>/;

# Opening tag of an owl:/rdf: element that carries at least one attribute
# (the pattern requires a space after the element name). This pattern survived
# intact in the stored source, apart from its unused capture groups.
my $ONTO_OPEN_RE = qr/\s*<(?:(?:owl|rdf):[^ ]+?) .*?>/;

# Lines on which copying of the entry resumes.
# NOTE(review): both tag names were stripped from the stored source; "def" and
# "termg" are presumed from the LT4eL lexicon layout -- confirm.
my $RESUME_RE = qr/<def|<termg/;

@ARGV >= 1
    or die "Usage: $0 <lexicon-file>\n";
my $inputFile = $ARGV[0];

# Three-argument open with a lexical handle: the file name can no longer be
# interpreted as a mode or a pipe, and a missing/unreadable file is reported
# instead of silently producing empty output.
open(my $lexFh, '<', $inputFile)
    or die "Cannot open '$inputFile' for reading: $!\n";
binmode($lexFh, ':utf8');
binmode(STDOUT, ':utf8');

my $doWrite = 1;                 # true while input lines are copied through unchanged
my $writeMainEntryElement = 0;   # true when the previous line was a bare entry opening tag

# Test definedness rather than truth, so that a final line consisting of "0"
# without a trailing newline does not end the loop early.
while (defined(my $line = <$lexFh>)) {
    if ($line =~ /($ENTRY_OPEN_RE)($ONTO_OPEN_RE)/) {
        # entry + first owl/rdf element on one line
        print "$1\n $2\n";
        $doWrite = 0;   # do not write next lines, unless it contains <def> or <termg>
    }
    elsif ($writeMainEntryElement) {
        # The entry tag was on the previous line; this line is expected to hold
        # the first owl/rdf element. Print only its opening tag. The match is
        # checked so that a non-matching line never prints a stale capture.
        if ($line =~ /($ONTO_OPEN_RE)/) {
            print "$1\n";
        }
        $doWrite = 0;
        $writeMainEntryElement = 0;
    }
    elsif ($line =~ /($ENTRY_OPEN_RE)/) {
        # entry tag alone on its line: the owl/rdf element follows on the next line
        print "$1\n";
        $writeMainEntryElement = 1;
    }
    elsif ($doWrite) {
        print $line;
    }
    elsif ($line =~ $RESUME_RE) {
        # First line of the part that is kept: resume copying from here on.
        # NOTE(review): the body of this branch was lost in the stored source and
        # is reconstructed from the header comment -- confirm.
        print $line;
        $doWrite = 1;
    }
}

close($lexFh)
    or die "Error while closing '$inputFile': $!\n";