1 | /* This file is part of the project "Hilbert II" - http://www.qedeq.org |
2 | * |
3 | * Copyright 2000-2014, Michael Meyling <mime@qedeq.org>. |
4 | * |
5 | * "Hilbert II" is free software; you can redistribute |
6 | * it and/or modify it under the terms of the GNU General Public |
7 | * License as published by the Free Software Foundation; either |
8 | * version 2 of the License, or (at your option) any later version. |
9 | * |
10 | * This program is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13 | * GNU General Public License for more details. |
14 | */ |
15 | |
16 | package org.qedeq.kernel.bo.parser; |
17 | |
18 | import java.util.ArrayList; |
19 | import java.util.List; |
20 | |
21 | import org.qedeq.base.trace.Trace; |
22 | |
23 | /* |
24 | * TODO mime 20080118: refactor |
25 | * |
26 | * Whitespace LaTeX form, could be eaten |
27 | * \t |
28 | * \r |
29 | * \n |
30 | * \\ |
31 | * \\, |
32 | * & |
33 | * \\\\ |
34 | * \\par |
35 | * \\quad |
36 | * \\qquad |
37 | * |
38 | * Separator only one allowed, before and after only whitespace is possible |
39 | * , |
40 | * | |
41 | * $$ |
42 | * Separator should be read as tokens. |
43 | * |
 * Problem: If an atom is followed by "(" it should be taken as a
 * (function) operator. But if we start with readToken we don't see the "("
 * character.
47 | * |
48 | * Problem: Could whitespace be recognized? |
49 | * Translating whitespace tokens into spaces is not easy, one has to know the |
50 | * end of the whitespace token. |
51 | * Possible solution: |
52 | * function read token (LaTeX specific) |
53 | * skip real whitespace (" ", "\t", "\r", "\n") |
54 | * read char |
55 | * case char |
56 | * "\\" read characters or numbers (check LaTeX Syntax) |
57 | * "{", "}", "(", ")" are also allowed |
58 | * resulting string is token |
59 | * LaTeX command definition modifies above: |
 * Most LaTeX commands have one of the following two formats: either they begin
 * with a backslash (\) and then have a name consisting only of letters, which is terminated
 * by one or more spaces, by a following special character or by a digit; or they consist
 * of a backslash and exactly one special character or digit.
 * Upper and lower case letters are distinct in command names too. To get a space after
 * a command name one has to use "{}" to terminate the command name,
 * or use a dedicated command for the space.
67 | */ |
68 | |
69 | /** |
70 | * Parse LaTeX term or formula data into {@link org.qedeq.kernel.bo.parser.Term}s. |
71 | * |
72 | * @author Michael Meyling |
73 | */ |
74 | public class LatexMathParser extends MathParser { |
75 | |
76 | /** This class. */ |
77 | private static final Class CLASS = LatexMathParser.class; |
78 | |
79 | /** Characters with special LaTeX meaning. */ |
80 | private static final String SPECIALCHARACTERS = "(),{}\\~%$&"; |
81 | |
82 | /** Counter for token whitespace lines. */ |
83 | private int tokenWhiteSpaceLines; |
84 | |
85 | /** |
86 | * Constructor. |
87 | * |
88 | */ |
89 | public LatexMathParser() { |
90 | super(); |
91 | } |
92 | |
93 | protected final String readToken() { |
94 | final String method = "readToken()"; |
95 | Trace.begin(CLASS, this, method); |
96 | StringBuffer token = new StringBuffer(); |
97 | tokenWhiteSpaceLines = 0; |
98 | try { |
99 | do { |
100 | tokenWhiteSpaceLines += readPureWhitespace(); |
101 | if (tokenWhiteSpaceLines > 1) { |
102 | break; |
103 | } |
104 | if (eof()) { |
105 | if (token.length() <= 0) { |
106 | token = null; |
107 | } |
108 | break; |
109 | } |
110 | final int c = getChar(); |
111 | if (Character.isDigit((char) c)) { |
112 | token.append((char) readChar()); |
113 | if (Character.isDigit((char) getChar())) { |
114 | continue; |
115 | } |
116 | break; |
117 | } |
118 | if (SPECIALCHARACTERS.indexOf(c) >= 0) { |
119 | switch (c) { |
120 | case '&': |
121 | case '%': |
122 | case '~': |
123 | case '$': // TODO mime 20060504 or break in this case? |
124 | readChar(); |
125 | continue; |
126 | case '\\': |
127 | final String t = readBackslashToken(); |
128 | if (t.equals(" ") || t.equals("quad") || t.equals("qquad")) { |
129 | continue; |
130 | } |
131 | token.append(t); |
132 | if ('_' == getChar() || '^' == getChar()) { |
133 | token.append((char) readChar()); |
134 | continue; |
135 | } |
136 | break; |
137 | case '{': |
138 | readChar(); |
139 | token.append("("); |
140 | break; |
141 | case '}': |
142 | readChar(); |
143 | token.append(")"); |
144 | break; |
145 | default: |
146 | readChar(); |
147 | token.append((char) c); |
148 | if ('_' == getChar() || '^' == getChar()) { |
149 | token.append((char) readChar()); |
150 | continue; |
151 | } |
152 | } |
153 | break; |
154 | } |
155 | token.append((char) readChar()); |
156 | if ('_' == getChar() || '^' == getChar()) { |
157 | token.append((char) readChar()); |
158 | continue; |
159 | } |
160 | break; |
161 | /* |
162 | String operator = null; |
163 | markPosition(); |
164 | while (!eof() && (Character.isLetterOrDigit((char) getChar()) || '_' == getChar() |
165 | || '^' == getChar())) { |
166 | token.append((char) readChar()); |
167 | if (null != getOperator(token.toString())) { |
168 | operator = token.toString(); |
169 | clearMark(); |
170 | markPosition(); |
171 | } |
172 | } |
173 | if (operator != null) { |
174 | rewindPosition(); |
175 | token.setLength(0); |
176 | token.append(operator); |
177 | } else { |
178 | clearMark(); |
179 | } |
180 | */ |
181 | } while (!eof()); |
182 | Trace.param(CLASS, this, method, "return token", token); |
183 | return (token != null ? token.toString() : null); |
184 | } finally { |
185 | Trace.end(CLASS, this, method); |
186 | } |
187 | } |
188 | |
189 | private String readBackslashToken() { |
190 | final String method = "readBackslashToken()"; |
191 | Trace.begin(CLASS, this, method); |
192 | if (getChar() != '\\') { |
193 | throw new IllegalArgumentException("\\ expected"); |
194 | } |
195 | readChar(); // read \ |
196 | if (eof()) { |
197 | Trace.param(CLASS, this, method, "return", null); |
198 | Trace.end(CLASS, this, method); |
199 | return null; |
200 | } |
201 | if (!Character.isLetter((char) getChar())) { |
202 | Trace.param(CLASS, this, method, "return", (char) getChar()); |
203 | Trace.end(CLASS, this, method); |
204 | return "" + ((char) readChar()); |
205 | } |
206 | final StringBuffer buffer = new StringBuffer(); |
207 | do { |
208 | buffer.append((char) readChar()); |
209 | } while (!eof() && Character.isLetter((char) getChar())); |
210 | Trace.param(CLASS, this, method, "return", buffer.toString()); |
211 | Trace.end(CLASS, this, method); |
212 | return buffer.toString(); |
213 | } |
214 | |
215 | private int readPureWhitespace() { |
216 | int lines = 0; |
217 | while (getChar() != -1 && Character.isWhitespace((char) getChar())) { |
218 | if ('\n' == (char) getChar()) { |
219 | lines++; |
220 | } |
221 | readChar(); |
222 | } |
223 | return lines; |
224 | } |
225 | |
226 | protected final Operator getOperator(final String token) { |
227 | Operator result = null; |
228 | if (token == null) { |
229 | return null; |
230 | } |
231 | for (int i = 0; i < getOperators().size(); i++) { |
232 | if (token.equals(((Operator) getOperators().get(i)).getStartSymbol())) { |
233 | result = (Operator) getOperators().get(i); |
234 | break; |
235 | } |
236 | } |
237 | if (result != null) { |
238 | return result; |
239 | } |
240 | // mime 20080725: no operator found -> return subject variable |
241 | if (SPECIALCHARACTERS.indexOf(token) < 0) { |
242 | return new Operator(token, null, null, "VAR", token, 200, 0, 0); |
243 | } |
244 | return null; |
245 | } |
246 | |
247 | protected final List getOperators(final String token) { |
248 | final List result = new ArrayList(); |
249 | if (token == null) { |
250 | return result; |
251 | } |
252 | for (int i = 0; i < getOperators().size(); i++) { |
253 | if (token.equals(((Operator) getOperators().get(i)).getStartSymbol())) { |
254 | result.add(getOperators().get(i)); |
255 | } |
256 | } |
257 | // mime 20080725: no operator found -> return subject variable |
258 | if (result.size() <= 0 && SPECIALCHARACTERS.indexOf(token) < 0) { |
259 | result.add(new Operator(token, null, null, "VAR", token, 200, 0, 0)); |
260 | } |
261 | return result; |
262 | } |
263 | |
264 | protected boolean eot(final String token) { |
265 | return token == null || token.trim().length() == 0; |
266 | } |
267 | |
268 | } |