version 1.10, 2008/04/29 10:15:58
|
version 1.21, 2014/08/25 18:02:48
|
Line 1
|
Line 1
|
# The LearningOnline Network |
# The LearningOnline Network |
# entity -> tex. |
# entity -> tex. |
# |
# |
# |
# $Id$ |
# |
# |
# Copyright Michigan State University Board of Trustees |
# Copyright Michigan State University Board of Trustees |
# |
# |
Line 25
|
Line 25
|
# http://www.lon-capa.org/ |
# http://www.lon-capa.org/ |
# |
# |
# |
# |
|
|
package Apache::entities; |
package Apache::entities; |
use strict; |
use strict; |
# |
|
# This file contains a table driven entity-->latex converter. |
|
# |
|
# Assumptions: |
|
# The number of entities in a resource is small compared with the |
|
# number of possible entities that might be translated. |
|
# Therefore the strategy is to match a general entity pattern |
|
# &.+; over and over, pull out the match look it up in an entity -> tex hash |
|
# and do the replacement. |
|
# |
|
# In order to simplify the hash, the following reductions are done: |
|
# &#d+; have the &# and ; stripped and is converted to an int. |
|
# &#.+; have the &#x and ; stripped and is converted to an int as a hex |
|
# value. |
|
# All others have the & and ; stripped. |
|
|
|
|
=pod |
|
|
# The hash: Add new conversions here; leave off the leading & and the trailing ; |
=head1 TABLES ASCII code page |
# all numeric entities need only appear as their decimal versions |
|
# (e.g. 1234 is sufficient; there is no need for 0x4d2 as well). |
|
# |
|
# This entity table is mercilessly cribbed from the HTML pocket reference |
|
# table starting at pg 82. In most cases the LaTeX equivalent codes come from |
|
# the original massive regular expression replacements originally by |
|
# A. Sakharuk in lonprintout.pm |
|
# |
|
# I also want to acknowledge |
|
# ISO Character entities and their LaTeX equivalents by |
|
# Vidar Bronken Gundersen, and Rune Mathisen |
|
# http://www.bitjungle.com/isoent-ref.pdf |
|
# |
|
|
|
# Note numerical entities are essentially unicode character codes. |
=over |
# |
|
package Apache::entities; |
|
|
|
my %entities = ( |
=item (7-13) |
|
|
|
Translation to empty strings |
|
|
|
=item (32-126) |
|
|
|
Translations to simple characters |
|
|
|
=item (130-140) |
|
|
|
Controls and Latin-1 supplement. Note that some entities that have |
|
visible effect are not printing unicode characters. Specifically |
|
&#130;-&#160; |
|
|
|
=item (145-156) |
|
|
|
There's a gap here in my entity table |
|
|
|
=item (159-255) |
|
|
|
Another short gap |
|
|
|
=item (295) |
|
|
|
hbar entity number comes from the unicode character: |
|
see e.g. http://www.unicode.org/charts/PDF/U0100.pdf |
|
ISO also documents a 'planck' entity. |
|
|
|
=item (338-376) |
|
|
|
Latin extended-A HTML 4.01 entities |
|
|
|
=item (402) |
|
|
|
Latin extended B HTML 4.01 entities |
|
|
|
=item (710 & 732) |
|
|
|
Spacing modifier letters |
|
|
|
=item (913-937) |
|
|
|
Greek uppercase (skips 930) |
|
|
|
=item (945-982) |
|
|
# ---- ASCII code page: ---------------- |
Greek lowercase |
|
|
|
=item (8194-8364) |
|
|
|
The general punctuation set |
|
|
|
=item (8472-8501) |
|
|
|
Letter like symbols |
|
|
|
=item (8592-8669) |
|
|
|
Arrows and then some (harpoons from Hon Kie). |
|
|
|
=item (8704-8734) |
|
|
|
Mathematical operators. |
|
|
|
=item (8735-9830) |
|
|
|
The items below require the isoent latex package which I can't find at least for FC5. |
|
Temporarily commented out. |
|
|
|
=back |
|
|
|
=cut |
|
|
|
my %entities = ( |
|
|
# Translation to empty strings: |
# Translation to empty strings: |
|
|
Line 176 my %entities = (
|
Line 219 my %entities = (
|
125 => '\}', |
125 => '\}', |
126 => '\~', |
126 => '\~', |
|
|
# Controls and Latin-1 supplement. Note that some entities that have |
# Controls and Latin-1 supplement. |
# visible effect are not printing unicode characters. Specifically |
|
# &#130;-&#160; |
|
|
|
130 => ',', |
130 => ',', |
131 => '\ensuremath{f}', |
131 => '\ensuremath{f}', |
Line 192 my %entities = (
|
Line 233 my %entities = (
|
139 => '\ensuremath{<}', |
139 => '\ensuremath{<}', |
140 => '{\OE}', |
140 => '{\OE}', |
|
|
# There's a gap here in my entity table |
# There's a gap here in my entity table |
|
|
145 => '`', |
145 => '`', |
146 => '\'', |
146 => '\'', |
Line 206 my %entities = (
|
Line 247 my %entities = (
|
154 => '\v{s}', |
154 => '\v{s}', |
155 => '\ensuremath{>}', |
155 => '\ensuremath{>}', |
156 => '\oe ', |
156 => '\oe ', |
|
|
# Another short gap: |
# Another short gap: |
|
|
159 => '\"Y', |
159 => '\"Y', |
160 => '~', |
160 => '~', |
Line 397 my %entities = (
|
Line 438 my %entities = (
|
255 => '\\"{y}', |
255 => '\\"{y}', |
'yuml' => '\\"{y}', |
'yuml' => '\\"{y}', |
|
|
# hbar entity number comes from the unicode charater: |
|
# see e.g. http://www.unicode.org/charts/PDF/U0100.pdf |
# hbar entity number comes from the unicode character: |
# ISO also documents a 'planck' entity. |
|
|
|
295 => '\ensuremath{\hbar}', |
295 => '\ensuremath{\hbar}', |
'planck' => '\ensuremath{\hbar}', |
'planck' => '\ensuremath{\hbar}', |
Line 417 my %entities = (
|
Line 457 my %entities = (
|
376 => '\\"{Y}', |
376 => '\\"{Y}', |
'Yuml' => '\\"{Y}', |
'Yuml' => '\\"{Y}', |
|
|
|
|
# Latin extended B HTML 4.01 entities |
# Latin extended B HTML 4.01 entities |
|
|
402 => '\ensuremath{f}', |
402 => '\ensuremath{f}', |
Line 466 my %entities = (
|
Line 505 my %entities = (
|
'Pi' => '\ensuremath{\Pi}', |
'Pi' => '\ensuremath{\Pi}', |
929 => '\ensuremath{\mathrm{P}}', |
929 => '\ensuremath{\mathrm{P}}', |
'Rho' => '\ensuremath{\mathrm{P}}', |
'Rho' => '\ensuremath{\mathrm{P}}', |
|
|
# Skips 930 |
|
|
|
931 => '\ensuremath{\Sigma}', |
931 => '\ensuremath{\Sigma}', |
'Sigma' => '\ensuremath{\Sigma}', |
'Sigma' => '\ensuremath{\Sigma}', |
932 => '\ensuremath{\mathrm{T}}', |
932 => '\ensuremath{\mathrm{T}}', |
Line 484 my %entities = (
|
Line 520 my %entities = (
|
937 => '\ensuremath{\Omega}', |
937 => '\ensuremath{\Omega}', |
'Omega' => '\ensuremath{\Omega}', |
'Omega' => '\ensuremath{\Omega}', |
|
|
|
|
# Greek lowercase: |
# Greek lowercase: |
|
|
945 => '\ensuremath{\alpha}', |
945 => '\ensuremath{\alpha}', |
Line 544 my %entities = (
|
Line 579 my %entities = (
|
982 => '\ensuremath{\varpi}', |
982 => '\ensuremath{\varpi}', |
'piv' => '\ensuremath{\varpi}', |
'piv' => '\ensuremath{\varpi}', |
|
|
|
|
# The general punctuation set: |
# The general punctuation set: |
|
|
8194, => '\hspace{.5em}', |
8194, => '\hspace{.5em}', |
Line 603 my %entities = (
|
Line 637 my %entities = (
|
'euro' => '\texteuro', |
'euro' => '\texteuro', |
|
|
# Letter like symbols |
# Letter like symbols |
|
|
|
|
8472 => '\ensuremath{\wp}', |
8472 => '\ensuremath{\wp}', |
'weierp' => '\ensuremath{\wp}', |
'weierp' => '\ensuremath{\wp}', |
Line 618 my %entities = (
|
Line 651 my %entities = (
|
|
|
# Arrows and then some (harpoons from Hon Kie). |
# Arrows and then some (harpoons from Hon Kie). |
|
|
8592 => '\textleftarrow', |
8592 => '\ensuremath{\leftarrow}', |
'larr' => '\textleftarrow', |
'larr' => '\ensuremath{\leftarrow}', |
8593 => '\textuparrow', |
8593 => '\ensuremath{\uparrow}', |
'uarr' => '\textuparrow', |
'uarr' => '\ensuremath{\uparrow}', |
8594 => '\textrightarrow', |
8594 => '\ensuremath{\rightarrow}', |
'rarr' => '\textrightarrow', |
'rarr' => '\ensuremath{\rightarrow}', |
8595 => '\textdownarrow', |
'rightarrow' => '\ensuremath{\rightarrow}', |
'darr' => '\textdownarrow', |
8595 => '\ensuremath{\downarrow}', |
|
'darr' => '\ensuremath{\downarrow}', |
8596 => '\ensuremath{\leftrightarrow}', |
8596 => '\ensuremath{\leftrightarrow}', |
'harr' => '\ensuremath{\leftrightarrow}', |
'harr' => '\ensuremath{\leftrightarrow}', |
8598 => '\ensuremath{\nwarrow}', |
8598 => '\ensuremath{\nwarrow}', |
Line 662 my %entities = (
|
Line 696 my %entities = (
|
8669 => '\ensuremath{\rightsquigarrow}', |
8669 => '\ensuremath{\rightsquigarrow}', |
'rarrw' => '\ensuremath{\rightsquigarrow}', |
'rarrw' => '\ensuremath{\rightsquigarrow}', |
|
|
|
|
# Mathematical operators. |
# Mathematical operators. |
|
|
|
|
'forall' => '\ensuremath{\forall}', |
'forall' => '\ensuremath{\forall}', |
8704 => '\ensuremath{\forall}', |
8704 => '\ensuremath{\forall}', |
Line 711 my %entities = (
|
Line 743 my %entities = (
|
8733 => '\ensuremath{\propto}', |
8733 => '\ensuremath{\propto}', |
'infin' => '\ensuremath{\infty}', |
'infin' => '\ensuremath{\infty}', |
8734 => '\ensuremath{\infty}', |
8734 => '\ensuremath{\infty}', |
# |
|
# The items below require the isoent latex package which I can't find at least for FC5. |
# The items below require the isoent latex package which I can't find at least for FC5. |
# Temporarily commented out. |
# Temporarily commented out. |
# |
|
# 'ang90' => '\ensuremath{\sqangle}', |
'ang90' => '\ensuremath{\sqangle}', |
# 8735 => '\ensuremath{\sqangle}', |
8735 => '\ensuremath{\sqangle}', |
|
|
'ang' => '\ensuremath{\angle}', |
'ang' => '\ensuremath{\angle}', |
8736 => '\ensuremath{\angle}', |
8736 => '\ensuremath{\angle}', |
'angmsd' => '\ensuremath{\measuredangle}', |
'angmsd' => '\ensuremath{\measuredangle}', |
Line 756 my %entities = (
|
Line 789 my %entities = (
|
'cong' => '\ensuremath{\cong}', |
'cong' => '\ensuremath{\cong}', |
8773 => '\ensuremath{\cong}', |
8773 => '\ensuremath{\cong}', |
8775 => '\ensuremath{\ncong}', |
8775 => '\ensuremath{\ncong}', |
|
8776 => '\ensuremath{\approx}', |
|
'approx' => '\ensuremath{\approx}', |
8778 => '\ensuremath{\approxeq}', |
8778 => '\ensuremath{\approxeq}', |
|
'approxeq' => '\ensuremath{\approxeq}', |
8784 => '\ensuremath{\doteq}', |
8784 => '\ensuremath{\doteq}', |
8785 => '\ensuremath{\doteqdot}', |
8785 => '\ensuremath{\doteqdot}', |
8786 => '\ensuremath{\fallingdotseq}', |
8786 => '\ensuremath{\fallingdotseq}', |
Line 925 my %entities = (
|
Line 961 my %entities = (
|
|
|
); |
); |
|
|
# There are some named entities that don't have a good |
=pod |
# latex equivalent, these are converted to utf-8 via this table |
|
# of entity name -> unicode number. |
=head1 UNICODE TABLE |
|
|
|
=over |
|
|
|
There are some named entities that don't have a good |
|
latex equivalent, these are converted to utf-8 via this table |
|
of entity name -> unicode number. |
|
|
|
=back |
|
|
|
=cut |
|
|
my %utf_table = ( |
my %utf_table = ( |
'THORN' => 222, |
'THORN' => 222, |
'thorn' => 254, |
'thorn' => 254, |
'eth' => 240 |
'eth' => 240, |
|
'hearts' => 9829 |
); |
); |
|
|
# |
|
# Convert a numerical entity (that does not exist in our hash) |
|
# to its UTF-8 equivalent representation. |
|
# This allows us to support, to some extent, any entity for which |
|
# dvipdf can find a glyph (given that LaTeX is now UTF-8 clean). |
|
# |
|
# Parameters: |
|
# unicode - The unicode for the character. This is assumed to |
|
# be a decimal value |
|
# Returns: |
|
# The UTF-8 equivalent of the value. |
|
# |
|
sub entity_to_utf8 { |
sub entity_to_utf8 { |
my ($unicode) = @_; |
my ($unicode) = @_; |
my $result = pack("U", $unicode); |
my $result = pack("U", $unicode); |
Line 954 sub entity_to_utf8 {
|
Line 989 sub entity_to_utf8 {
|
} |
} |
|
|
|
|
# |
|
# Convert an entity to the corresponding LaTeX if possible. |
|
# If not possible, and the entity is numeric, |
|
# the entity is treated like a Unicode character and converted |
|
# to UTF-8 which should display as long as dvipdf can find the |
|
# appropriate glyph. |
|
# |
|
# The entity is assumed to have already had the |
|
# &# ; or & ; removed |
|
# |
|
# Parameters: |
|
# entity - Name of entity to convert. |
|
# Returns: |
|
# One of the following: |
|
# - Latex string that produces the entity. |
|
# - UTF-8 equivalent of a numeric entity for which we don't have a latex string. |
|
# - ' ' for text entities for which there's no latex equivalent. |
|
# |
|
sub entity_to_latex { |
sub entity_to_latex { |
my ($entity) = @_; |
my ($entity) = @_; |
|
|
# Try to look up the entity (text or numeric) in the hash: |
# Try to look up the entity (text or numeric) in the hash: |
|
|
|
|
|
|
my $latex = $entities{"$entity"}; |
my $latex = $entities{"$entity"}; |
if (defined $latex) { |
if (defined $latex) { |
return $latex; |
return $latex; |
Line 999 sub entity_to_latex {
|
Line 1018 sub entity_to_latex {
|
return " "; |
return " "; |
} |
} |
|
|
# |
|
# Convert all the entities in a string. |
|
# We locate all the entities, pass them into entity_to_latex and |
|
# replace occurrences in the input string. |
|
# The assumption is that there are few entities in any string/document |
|
# so this looping is not too bad. The advantage of looping vs. regexping is |
|
# that we now can use lookup tables for the translation in entity_to_latex above. |
|
# |
|
# Parameters: |
|
# input - Input string/document |
|
# Returns |
|
# input with entities replaced by latexable stuff (UTF-8 encodings or |
|
# latex control strings to produce the entity). |
|
# |
|
# |
|
sub replace_entities { |
sub replace_entities { |
my ($input) = @_; |
my ($input) = @_; |
my $start; |
my $start; |
Line 1030 sub replace_entities {
|
Line 1035 sub replace_entities {
|
$latex = &entity_to_latex($entity); |
$latex = &entity_to_latex($entity); |
substr($input, $start, $end-$start) = $latex; |
substr($input, $start, $end-$start) = $latex; |
} |
} |
|
|
|
# Hexadecimal entities: |
|
|
|
while ($input =~ /&\#x(\d|[a-f,A-f])+;/) { |
|
($start) = @-; |
|
($end) = @+; |
|
$entity = "0" . substr($input, $start+2, $end-$start-3); # 0xhexnumber |
|
$latex = &entity_to_latex(hex($entity)); |
|
substr($input, $start, $end-$start) = $latex; |
|
} |
|
|
|
|
# Now the &text; entities: |
# Now the &text; entities: |
|
|
while ($input =~/(&\w+;)/) { |
while ($input =~/(&\w+;)/) { |
Line 1046 sub replace_entities {
|
Line 1063 sub replace_entities {
|
1; |
1; |
|
|
__END__ |
__END__ |
|
|
|
=pod |
|
|
|
=head1 NAME |
|
|
|
Apache::entities.pm |
|
|
|
=head1 SYNOPSIS |
|
|
|
This file contains a table driven entity-->latex converter. |
|
|
|
This is part of the LearningOnline Network with CAPA project |
|
described at http://www.lon-capa.org. |
|
|
|
=head1 OVERVIEW |
|
|
|
|
|
Assumptions: |
|
The number of entities in a resource is small compared with the |
|
number of possible entities that might be translated. |
|
Therefore the strategy is to match a general entity pattern |
|
&.+; over and over, pull out the match, look it up in an entity -> tex hash |
|
and do the replacement. |
|
|
|
In order to simplify the hash, the following reductions are done: |
|
&#d+; have the &# and ; stripped and is converted to an int. |
|
&#x.+; have the &#x and ; stripped and is converted to an int as a hex |
|
value. |
|
All others have the & and ; stripped. |
|
|
|
|
|
The hash: Add new conversions here; leave off the leading & and the trailing ; |
|
all numeric entities need only appear as their decimal versions |
|
(e.g. 1234 is sufficient; there is no need for 0x4d2 as well). |
|
|
|
This entity table is mercilessly cribbed from the HTML pocket reference |
|
table starting at pg 82. In most cases the LaTeX equivalent codes come from |
|
the original massive regular expression replacements originally by |
|
A. Sakharuk in lonprintout.pm |
|
|
|
I also want to acknowledge |
|
ISO Character entities and their LaTeX equivalents by |
|
Vidar Bronken Gundersen, and Rune Mathisen |
|
http://www.bitjungle.com/isoent-ref.pdf |
|
|
|
|
|
Note numerical entities are essentially unicode character codes. |
|
|
|
|
|
=head1 SUBROUTINES |
|
|
|
=over |
|
|
|
=item entity_to_utf8() |
|
|
|
|
|
Convert a numerical entity (that does not exist in our hash) |
|
to its UTF-8 equivalent representation. |
|
This allows us to support, to some extent, any entity for which |
|
dvipdf can find a glyph (given that LaTeX is now UTF-8 clean). |
|
|
|
Parameters: |
|
unicode - The unicode for the character. This is assumed to |
|
be a decimal value |
|
Returns: |
|
The UTF-8 equivalent of the value. |
|
|
|
=item entity_to_latex() |
|
|
|
Convert an entity to the corresponding LaTeX if possible. |
|
If not possible, and the entity is numeric, |
|
the entity is treated like a Unicode character and converted |
|
to UTF-8 which should display as long as dvipdf can find the |
|
appropriate glyph. |
|
|
|
The entity is assumed to have already had the |
|
&# ; or & ; removed |
|
|
|
Parameters: |
|
entity - Name of entity to convert. |
|
Returns: |
|
One of the following: |
|
- Latex string that produces the entity. |
|
- UTF-8 equivalent of a numeric entity for which we don't have a latex string. |
|
- ' ' for text entities for which there's no latex equivalent. |
|
|
|
|
|
=item replace_entities() |
|
|
|
Convert all the entities in a string. |
|
We locate all the entities, pass them into entity_to_latex and |
|
replace occurrences in the input string. |
|
The assumption is that there are few entities in any string/document |
|
so this looping is not too bad. The advantage of looping vs. regexping is |
|
that we now can use lookup tables for the translation in entity_to_latex above. |
|
|
|
Parameters: |
|
input - Input string/document |
|
Returns |
|
input with entities replaced by latexable stuff (UTF-8 encodings or |
|
latex control strings to produce the entity). |
|
|
|
=back |
|
|
|
=cut |