version 1.13, 2008/11/17 13:52:39
|
version 1.23, 2019/02/23 20:05:50
|
Line 1
|
Line 1
|
# The LearningOnline Network |
# The LearningOnline Network |
# entity -> tex. |
# entity -> tex. |
# |
# |
# |
# $Id$ |
# |
# |
# Copyright Michigan State University Board of Trustees |
# Copyright Michigan State University Board of Trustees |
# |
# |
Line 26
|
Line 26
|
# |
# |
# |
# |
|
|
|
package Apache::entities; |
|
use strict; |
|
|
=head1 NAME |
=pod |
|
|
Apache::entities.pm |
=head1 TABLES ASCII code page |
|
|
=head1 SYNOPSIS |
=over |
|
|
This file contains a table driven entity-->latex converter. |
=item (7-13) |
|
|
This is part of the LearningOnline Network with CAPA project |
Translation to empty strings |
described at http://www.lon-capa.org. |
|
|
|
=head1 OVERVIEW |
=item (32-126) |
|
|
|
Translations to simple characters |
|
|
Assumptions: |
=item (130-140) |
The number of entities in a resource is small compared with the |
|
number of possible entities that might be translated. |
|
Therefore the strategy is to match a general entity pattern |
|
&.+; over and over, pull out the match look it up in an entity -> tex hash |
|
and do the replacement. |
|
|
|
In order to simplify the hash, the following reductions are done: |
Controls and Latin-1 supplement. Note that some entities that have |
&#\d+; have the &# and ; stripped and are converted to an int. |
visible effect are not printing unicode characters. Specifically |
&#x.+; have the &#x and ; stripped and are converted to an int as a hex |
&#130;-&#160; |
value. |
|
All others have the & and ; stripped. |
|
|
|
|
=item (145-156) |
|
|
The hash: Add new conversions here; leave off the leading & and the trailing ; |
There's a gap here in my entity table |
all numeric entities need only appear as their decimal versions |
|
(e.g. 1234 is sufficient; there is no need for 0x4d2 as well). |
|
|
|
This entity table is mercilessly cribbed from the HTML pocket reference |
=item (159-255) |
table starting at pg 82. In most cases the LaTeX equivalent codes come from |
|
the original massive regular expression replacements originally by |
|
A. Sakharuk in lonprintout.pm |
|
|
|
I also want to acknowledge |
Another short gap |
ISO Character entities and their LaTeX equivalents by |
|
Vidar Bronken Gundersen, and Rune Mathisen |
|
http://www.bitjungle.com/isoent-ref.pdf |
|
|
|
|
=item (295) |
|
|
Note numerical entities are essentially unicode character codes. |
hbar entity number comes from the unicode character: |
|
see e.g. http://www.unicode.org/charts/PDF/U0100.pdf |
|
ISO also documents a 'planck' entity. |
|
|
|
=item (338-376) |
|
|
=head1 SUBROUTINES |
Latin extended-A HTML 4.01 entities |
|
|
=item entity_to_utf8() |
=item (402) |
|
|
|
Latin extended B HTML 4.01 entities |
|
|
Convert a numerical entity (that does not exist in our hash) |
=item (710 & 732) |
to its UTF-8 equivalent representation. |
|
This allows us to support, to some extent, any entity for which |
|
dvipdf can find a glyph (given that LaTeX is now UTF-8 clean). |
|
|
|
Parameters: |
Spacing modifier letters |
unicode - The unicode for the character. This is assumed to |
|
be a decimal value |
|
Returns: |
|
The UTF-8 equivalent of the value. |
|
|
|
=item entity_to_latex() |
=item (913-937) |
|
|
Convert an entity to the corresponding LaTeX if possible. |
Greek uppercase (skips 930) |
If not possible, and the entity is numeric, |
|
the entity is treated like a Unicode character and converted |
|
to UTF-8 which should display as long as dvipdf can find the |
|
appropriate glyph. |
|
|
|
The entity is assumed to have already had the |
=item (945-982) |
&; or & ; removed |
|
|
|
Parameters: |
Greek lowercase |
entity - Name of entity to convert. |
|
Returns: |
|
One of the following: |
|
- Latex string that produces the entity. |
|
- UTF-8 equivalent of a numeric entity for which we don't have a latex string. |
|
- ' ' for text entities for which there's no latex equivalent. |
|
|
|
|
=item (8194-8364) |
|
|
=item replace_entities() |
The general punctuation set |
|
|
Convert all the entities in a string. |
=item (8462-8501) |
We locate all the entities, pass them into entity_to_latex and |
|
replace occurrences in the input string. |
|
The assumption is that there are few entities in any string/document |
|
so this looping is not too bad. The advantage of looping vs. regexping is |
|
that we now can use lookup tables for the translation in entity_to_latex above. |
|
|
|
Parameters: |
Letter like symbols |
input - Input string/document |
|
Returns |
|
input with entities replaced by latexable stuff (UTF-8 encodings or |
|
latex control strings to produce the entity. |
|
|
|
=head1 TABLES ASCII code page |
=item (8592-8669) |
|
|
=cut |
Arrows and then some (harpoons from Hon Kie). |
|
|
|
=item (8704-8734) |
|
|
package Apache::entities; |
Mathematical operators. |
use strict; |
|
|
|
package Apache::entities; |
=item (8735-9830) |
|
|
my %entities = ( |
The items below require the isoent latex package which I can't find at least for FC5. |
|
Temporarily commented out. |
|
|
=pod |
=back |
|
|
=out |
=cut |
|
|
=item (7-13) |
my %entities = ( |
|
|
# Translation to empty strings: |
# Translation to empty strings: |
=cut |
|
|
|
7 => "", |
7 => "", |
9 => "", |
9 => "", |
10 => "", |
10 => "", |
13 => "", |
13 => "", |
|
|
=pod |
|
|
|
=item (32-126) |
|
|
|
# Translations to simple characters: |
# Translations to simple characters: |
|
|
=cut |
|
|
|
32 => " ", |
32 => " ", |
33 => "!", |
33 => "!", |
34 => '"', |
34 => '"', |
Line 258 my %entities = (
|
Line 219 my %entities = (
|
125 => '\}', |
125 => '\}', |
126 => '\~', |
126 => '\~', |
|
|
=pod |
# Controls and Latin-1 supplement. |
|
|
=item (130-140) |
|
|
|
Controls and Latin-1 supplement. Note that some entities that have |
|
visible effect are not printing unicode characters. Specifically |
|
&#130;-&#160; |
|
|
|
=cut |
|
|
|
130 => ',', |
130 => ',', |
131 => '\ensuremath{f}', |
131 => '\ensuremath{f}', |
Line 280 my %entities = (
|
Line 233 my %entities = (
|
139 => '\ensuremath{<}', |
139 => '\ensuremath{<}', |
140 => '{\OE}', |
140 => '{\OE}', |
|
|
=pod |
# There's a gap here in my entity table |
|
|
=item (145-156) |
|
|
|
There's a gap here in my entity table |
|
|
|
=cut |
|
|
|
145 => '`', |
145 => '`', |
146 => '\'', |
146 => '\'', |
Line 301 my %entities = (
|
Line 248 my %entities = (
|
155 => '\ensuremath{>}', |
155 => '\ensuremath{>}', |
156 => '\oe ', |
156 => '\oe ', |
|
|
=pod |
# Another short gap: |
|
|
=item (159-255) |
|
|
|
Another short gap: |
|
|
|
=cut |
|
|
|
159 => '\"Y', |
159 => '\"Y', |
160 => '~', |
160 => '~', |
Line 498 my %entities = (
|
Line 439 my %entities = (
|
'yuml' => '\\"{y}', |
'yuml' => '\\"{y}', |
|
|
|
|
=pod |
# hbar entity number comes from the unicode character: |
|
|
=item (295) |
|
|
|
hbar entity number comes from the unicode character: |
|
see e.g. http://www.unicode.org/charts/PDF/U0100.pdf |
|
ISO also documents a 'planck' entity. |
|
|
|
=cut |
|
|
|
295 => '\ensuremath{\hbar}', |
295 => '\ensuremath{\hbar}', |
'planck' => '\ensuremath{\hbar}', |
'planck' => '\ensuremath{\hbar}', |
|
|
=pod |
# Latin extended-A HTML 4.01 entities: |
|
|
=item (338-376) |
|
|
|
Latin extended-A HTML 4.01 entities: |
|
|
|
=cut |
|
|
|
338 => '\OE', |
338 => '\OE', |
'OElig' => '\OE', |
'OElig' => '\OE', |
Line 530 my %entities = (
|
Line 457 my %entities = (
|
376 => '\\"{Y}', |
376 => '\\"{Y}', |
'Yuml' => '\\"{Y}', |
'Yuml' => '\\"{Y}', |
|
|
=pod |
# Latin extended B HTML 4.01 entities |
|
|
=item (402) |
|
|
|
Latin extended B HTML 4.01 entities |
|
|
|
=cut |
|
|
|
402 => '\ensuremath{f}', |
402 => '\ensuremath{f}', |
'fnof' => '\ensuremath{f}', |
'fnof' => '\ensuremath{f}', |
|
|
=pod |
# Spacing modifier letters: |
|
|
=item (710 & 732) |
|
|
|
Spacing modifier letters: |
|
|
|
=cut |
|
|
|
710 => '\^{}', |
710 => '\^{}', |
'circ' => '\^{}', |
'circ' => '\^{}', |
732 => '\~{}', |
732 => '\~{}', |
'tilde' => '\~{}', |
'tilde' => '\~{}', |
|
|
=pod |
# Greek uppercase: |
|
|
=item (913-929) |
|
|
|
Greek uppercase: |
|
|
|
=cut |
|
|
|
913 => '\ensuremath{\mathrm{A}}', |
913 => '\ensuremath{\mathrm{A}}', |
'Alpha' => '\ensuremath{\mathrm{A}}', |
'Alpha' => '\ensuremath{\mathrm{A}}', |
Line 596 my %entities = (
|
Line 505 my %entities = (
|
'Pi' => '\ensuremath{\Pi}', |
'Pi' => '\ensuremath{\Pi}', |
929 => '\ensuremath{\mathrm{P}}', |
929 => '\ensuremath{\mathrm{P}}', |
'Rho' => '\ensuremath{\mathrm{P}}', |
'Rho' => '\ensuremath{\mathrm{P}}', |
|
|
|
|
=pod |
|
|
|
=item (931-937) |
|
|
|
Skips 930 |
|
|
|
=cut |
|
|
|
931 => '\ensuremath{\Sigma}', |
931 => '\ensuremath{\Sigma}', |
'Sigma' => '\ensuremath{\Sigma}', |
'Sigma' => '\ensuremath{\Sigma}', |
932 => '\ensuremath{\mathrm{T}}', |
932 => '\ensuremath{\mathrm{T}}', |
Line 621 my %entities = (
|
Line 520 my %entities = (
|
937 => '\ensuremath{\Omega}', |
937 => '\ensuremath{\Omega}', |
'Omega' => '\ensuremath{\Omega}', |
'Omega' => '\ensuremath{\Omega}', |
|
|
=pod |
# Greek lowercase: |
|
|
=item (945-982) |
|
|
|
Greek lowercase: |
|
|
|
=cut |
|
|
|
945 => '\ensuremath{\alpha}', |
945 => '\ensuremath{\alpha}', |
'alpha' => '\ensuremath{\alpha}', |
'alpha' => '\ensuremath{\alpha}', |
Line 686 my %entities = (
|
Line 579 my %entities = (
|
982 => '\ensuremath{\varpi}', |
982 => '\ensuremath{\varpi}', |
'piv' => '\ensuremath{\varpi}', |
'piv' => '\ensuremath{\varpi}', |
|
|
=pod |
# The general punctuation set: |
|
|
=item (8194-8364) |
|
|
|
The general punctuation set: |
|
|
|
=cut |
|
|
|
8194, => '\hspace{.5em}', |
8194, => '\hspace{.5em}', |
'enspc' => '\hspace{.5em}', |
'enspc' => '\hspace{.5em}', |
Line 749 my %entities = (
|
Line 636 my %entities = (
|
8364 => '\texteuro', |
8364 => '\texteuro', |
'euro' => '\texteuro', |
'euro' => '\texteuro', |
|
|
=pod |
# Letter like symbols |
|
|
=item (8472-8501) |
8462 => '\ensuremath{h}', |
|
'planckh' => '\ensuremath{h}', |
Letter like symbols |
8463 => '\ensuremath{\hbar}', |
|
'hbar' => '\ensuremath{\hbar}', |
=cut |
|
|
|
|
|
8472 => '\ensuremath{\wp}', |
8472 => '\ensuremath{\wp}', |
'weierp' => '\ensuremath{\wp}', |
'weierp' => '\ensuremath{\wp}', |
8465 => '\ensuremath{\Im}', |
8465 => '\ensuremath{\Im}', |
Line 766 my %entities = (
|
Line 650 my %entities = (
|
'real' => '\ensuremath{\Re}', |
'real' => '\ensuremath{\Re}', |
8482 => '\texttrademark', |
8482 => '\texttrademark', |
'trade' => '\texttrademark', |
'trade' => '\texttrademark', |
|
8496 => '\ensuremath{\mathcal{E}}', |
|
'expectation' => '\ensuremath{\mathcal{E}}', |
8501 => '\ensuremath{\aleph}', |
8501 => '\ensuremath{\aleph}', |
'alefsym'=> '\ensuremath{\aleph}', |
'alefsym'=> '\ensuremath{\aleph}', |
|
|
=pod |
# Arrows and then some (harpoons from Hon Kie). |
|
|
=item (8592-8669) |
|
|
|
Arrows and then some (harpoons from Hon Kie). |
|
|
|
=cut |
8592 => '\ensuremath{\leftarrow}', |
|
'larr' => '\ensuremath{\leftarrow}', |
8592 => '\textleftarrow', |
8593 => '\ensuremath{\uparrow}', |
'larr' => '\textleftarrow', |
'uarr' => '\ensuremath{\uparrow}', |
8593 => '\textuparrow', |
8594 => '\ensuremath{\rightarrow}', |
'uarr' => '\textuparrow', |
'rarr' => '\ensuremath{\rightarrow}', |
8594 => '\textrightarrow', |
'rightarrow' => '\ensuremath{\rightarrow}', |
'rarr' => '\textrightarrow', |
8595 => '\ensuremath{\downarrow}', |
8595 => '\textdownarrow', |
'darr' => '\ensuremath{\downarrow}', |
'darr' => '\textdownarrow', |
|
8596 => '\ensuremath{\leftrightarrow}', |
8596 => '\ensuremath{\leftrightarrow}', |
'harr' => '\ensuremath{\leftrightarrow}', |
'harr' => '\ensuremath{\leftrightarrow}', |
8598 => '\ensuremath{\nwarrow}', |
8598 => '\ensuremath{\nwarrow}', |
Line 821 my %entities = (
|
Line 702 my %entities = (
|
8669 => '\ensuremath{\rightsquigarrow}', |
8669 => '\ensuremath{\rightsquigarrow}', |
'rarrw' => '\ensuremath{\rightsquigarrow}', |
'rarrw' => '\ensuremath{\rightsquigarrow}', |
|
|
=pod |
# Mathematical operators. |
|
|
=item (8704-8734) |
|
|
|
Mathematical operators. |
|
|
|
=cut |
|
|
|
|
|
'forall' => '\ensuremath{\forall}', |
'forall' => '\ensuremath{\forall}', |
8704 => '\ensuremath{\forall}', |
8704 => '\ensuremath{\forall}', |
Line 876 my %entities = (
|
Line 750 my %entities = (
|
'infin' => '\ensuremath{\infty}', |
'infin' => '\ensuremath{\infty}', |
8734 => '\ensuremath{\infty}', |
8734 => '\ensuremath{\infty}', |
|
|
|
# The items below require the isoent latex package which I can't find at least for FC5. |
=pod |
# Temporarily commented out. |
|
|
=item (8735-9830) |
|
|
|
|
|
The items below require the isoent latex package which I can't find at least for FC5. |
|
Temporarily commented out. |
|
|
|
'ang90' => '\ensuremath{\sqangle}', |
'ang90' => '\ensuremath{\sqangle}', |
8735 => '\ensuremath{\sqangle}', |
8735 => '\ensuremath{\sqangle}', |
|
|
=cut |
|
|
|
'ang' => '\ensuremath{\angle}', |
'ang' => '\ensuremath{\angle}', |
8736 => '\ensuremath{\angle}', |
8736 => '\ensuremath{\angle}', |
'angmsd' => '\ensuremath{\measuredangle}', |
'angmsd' => '\ensuremath{\measuredangle}', |
Line 929 my %entities = (
|
Line 795 my %entities = (
|
'cong' => '\ensuremath{\cong}', |
'cong' => '\ensuremath{\cong}', |
8773 => '\ensuremath{\cong}', |
8773 => '\ensuremath{\cong}', |
8775 => '\ensuremath{\ncong}', |
8775 => '\ensuremath{\ncong}', |
|
8776 => '\ensuremath{\approx}', |
|
'approx' => '\ensuremath{\approx}', |
8778 => '\ensuremath{\approxeq}', |
8778 => '\ensuremath{\approxeq}', |
|
'approxeq' => '\ensuremath{\approxeq}', |
8784 => '\ensuremath{\doteq}', |
8784 => '\ensuremath{\doteq}', |
8785 => '\ensuremath{\doteqdot}', |
8785 => '\ensuremath{\doteqdot}', |
8786 => '\ensuremath{\fallingdotseq}', |
8786 => '\ensuremath{\fallingdotseq}', |
Line 1100 my %entities = (
|
Line 969 my %entities = (
|
|
|
=pod |
=pod |
|
|
=item * |
=head1 UNICODE TABLE |
|
|
|
=over |
|
|
There are some named entities that don't have a good |
There are some named entities that don't have a good |
latex equivalent, these are converted to utf-8 via this table |
latex equivalent, these are converted to utf-8 via this table |
of entity name -> unicode number. |
of entity name -> unicode number. |
|
|
|
=back |
|
|
=cut |
=cut |
|
|
my %utf_table = ( |
my %utf_table = ( |
Line 1199 __END__
|
Line 1072 __END__
|
|
|
=pod |
=pod |
|
|
|
=head1 NAME |
|
|
|
Apache::entities.pm |
|
|
|
=head1 SYNOPSIS |
|
|
|
This file contains a table driven entity-->latex converter. |
|
|
|
This is part of the LearningOnline Network with CAPA project |
|
described at http://www.lon-capa.org. |
|
|
|
=head1 OVERVIEW |
|
|
|
|
|
Assumptions: |
|
The number of entities in a resource is small compared with the |
|
number of possible entities that might be translated. |
|
Therefore the strategy is to match a general entity pattern |
|
&.+; over and over, pull out the match look it up in an entity -> tex hash |
|
and do the replacement. |
|
|
|
In order to simplify the hash, the following reductions are done: |
|
&#\d+; have the &# and ; stripped and are converted to an int. |
|
&#x.+; have the &#x and ; stripped and are converted to an int as a hex |
|
value. |
|
All others have the & and ; stripped. |
|
|
|
|
|
The hash: Add new conversions here; leave off the leading & and the trailing ; |
|
all numeric entities need only appear as their decimal versions |
|
(e.g. 1234 is sufficient; there is no need for 0x4d2 as well). |
|
|
|
This entity table is mercilessly cribbed from the HTML pocket reference |
|
table starting at pg 82. In most cases the LaTeX equivalent codes come from |
|
the original massive regular expression replacements originally by |
|
A. Sakharuk in lonprintout.pm |
|
|
|
I also want to acknowledge |
|
ISO Character entities and their LaTeX equivalents by |
|
Vidar Bronken Gundersen, and Rune Mathisen |
|
http://www.bitjungle.com/isoent-ref.pdf |
|
|
|
|
|
Note numerical entities are essentially unicode character codes. |
|
|
|
|
|
=head1 SUBROUTINES |
|
|
|
=over |
|
|
|
=item entity_to_utf8() |
|
|
|
|
|
Convert a numerical entity (that does not exist in our hash) |
|
to its UTF-8 equivalent representation. |
|
This allows us to support, to some extent, any entity for which |
|
dvipdf can find a glyph (given that LaTeX is now UTF-8 clean). |
|
|
|
Parameters: |
|
unicode - The unicode for the character. This is assumed to |
|
be a decimal value |
|
Returns: |
|
The UTF-8 equivalent of the value. |
|
|
|
=item entity_to_latex() |
|
|
|
Convert an entity to the corresponding LaTeX if possible. |
|
If not possible, and the entity is numeric, |
|
the entity is treated like a Unicode character and converted |
|
to UTF-8 which should display as long as dvipdf can find the |
|
appropriate glyph. |
|
|
|
The entity is assumed to have already had the |
|
&; or & ; removed |
|
|
|
Parameters: |
|
entity - Name of entity to convert. |
|
Returns: |
|
One of the following: |
|
- Latex string that produces the entity. |
|
- UTF-8 equivalent of a numeric entity for which we don't have a latex string. |
|
- ' ' for text entities for which there's no latex equivalent. |
|
|
|
|
|
=item replace_entities() |
|
|
|
Convert all the entities in a string. |
|
We locate all the entities, pass them into entity_to_latex and |
|
replace occurrences in the input string. |
|
The assumption is that there are few entities in any string/document |
|
so this looping is not too bad. The advantage of looping vs. regexping is |
|
that we now can use lookup tables for the translation in entity_to_latex above. |
|
|
|
Parameters: |
|
input - Input string/document |
|
Returns |
|
input with entities replaced by latexable stuff (UTF-8 encodings or |
|
latex control strings to produce the entity. |
|
|
=back |
=back |
|
|
=cut |
=cut |