001 /* This file is part of the project "Hilbert II" - http://www.qedeq.org
002 *
003 * Copyright 2000-2011, Michael Meyling <mime@qedeq.org>.
004 *
005 * "Hilbert II" is free software; you can redistribute
006 * it and/or modify it under the terms of the GNU General Public
007 * License as published by the Free Software Foundation; either
008 * version 2 of the License, or (at your option) any later version.
009 *
010 * This program is distributed in the hope that it will be useful,
011 * but WITHOUT ANY WARRANTY; without even the implied warranty of
012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
013 * GNU General Public License for more details.
014 */
015
016 package org.qedeq.kernel.bo.parser;
017
018 import java.util.ArrayList;
019 import java.util.List;
020
021 import org.qedeq.base.trace.Trace;
022
023 /*
024 * TODO mime 20080118: refactor
025 *
026 * Whitespace LaTeX form, could be eaten
027 * \t
028 * \r
029 * \n
030 * \\
031 * \\,
032 * &
033 * \\\\
034 * \\par
035 * \\quad
036 * \\qquad
037 *
038 * Separator only one allowed, before and after only whitespace is possible
039 * ,
040 * |
041 * $$
042 * Separator should be read as tokens.
043 *
 * Problem: If an atom is followed by "(" it should be taken as a
 * (function) operator. But if we start with readToken we don't see the "("
 * character.
047 *
048 * Problem: Could whitespace be recognized?
049 * Translating whitespace tokens into spaces is not easy, one has to know the
050 * end of the whitespace token.
051 * Possible solution:
052 * function read token (LaTeX specific)
053 * skip real whitespace (" ", "\t", "\r", "\n")
054 * read char
055 * case char
056 * "\\" read characters or numbers (check LaTeX Syntax)
057 * "{", "}", "(", ")" are also allowed
058 * resulting string is token
059 * LaTeX command definition modifies above:
 * Most LaTeX commands have one of the following two formats: either they start
 * with a backslash (\) followed by a name consisting only of letters, which is terminated by
 * one or more spaces, by a following special character, or by a digit;
 * or they consist of a backslash and exactly one special character or digit.
 * Upper-case and lower-case letters are distinct in command names as well. To get a space
 * after a command name, one has to use "{}" to terminate the command name
 * or use a dedicated command for the space.
067 */
068
069 /**
070 * Parse LaTeX term or formula data into {@link org.qedeq.kernel.bo.parser.Term}s.
071 *
072 * @author Michael Meyling
073 */
074 public class LatexMathParser extends MathParser {
075
076 /** This class. */
077 private static final Class CLASS = LatexMathParser.class;
078
079 /** Characters with special LaTeX meaning. */
080 private static final String SPECIALCHARACTERS = "(),{}\\~%$&";
081
082 /** Counter for token whitespace lines. */
083 private int tokenWhiteSpaceLines;
084
085 /**
086 * Constructor.
087 *
088 */
089 public LatexMathParser() {
090 super();
091 }
092
093 protected final String readToken() {
094 final String method = "readToken()";
095 Trace.begin(CLASS, this, method);
096 StringBuffer token = new StringBuffer();
097 tokenWhiteSpaceLines = 0;
098 try {
099 do {
100 tokenWhiteSpaceLines += readPureWhitespace();
101 if (tokenWhiteSpaceLines > 1) {
102 break;
103 }
104 if (eof()) {
105 if (token.length() <= 0) {
106 token = null;
107 }
108 break;
109 }
110 final int c = getChar();
111 if (Character.isDigit((char) c)) {
112 token.append((char) readChar());
113 if (Character.isDigit((char) getChar())) {
114 continue;
115 }
116 break;
117 }
118 if (SPECIALCHARACTERS.indexOf(c) >= 0) {
119 switch (c) {
120 case '&':
121 case '%':
122 case '~':
123 case '$': // TODO mime 20060504 or break in this case?
124 readChar();
125 continue;
126 case '\\':
127 final String t = readBackslashToken();
128 if (t.equals(" ") || t.equals("quad") || t.equals("qquad")) {
129 continue;
130 }
131 token.append(t);
132 if ('_' == getChar() || '^' == getChar()) {
133 token.append((char) readChar());
134 continue;
135 }
136 break;
137 case '{':
138 readChar();
139 token.append("(");
140 break;
141 case '}':
142 readChar();
143 token.append(")");
144 break;
145 default:
146 readChar();
147 token.append((char) c);
148 if ('_' == getChar() || '^' == getChar()) {
149 token.append((char) readChar());
150 continue;
151 }
152 }
153 break;
154 }
155 token.append((char) readChar());
156 if ('_' == getChar() || '^' == getChar()) {
157 token.append((char) readChar());
158 continue;
159 }
160 break;
161 /*
162 String operator = null;
163 markPosition();
164 while (!eof() && (Character.isLetterOrDigit((char) getChar()) || '_' == getChar()
165 || '^' == getChar())) {
166 token.append((char) readChar());
167 if (null != getOperator(token.toString())) {
168 operator = token.toString();
169 clearMark();
170 markPosition();
171 }
172 }
173 if (operator != null) {
174 rewindPosition();
175 token.setLength(0);
176 token.append(operator);
177 } else {
178 clearMark();
179 }
180 */
181 } while (!eof());
182 Trace.param(CLASS, this, method, "return token", token);
183 return (token != null ? token.toString() : null);
184 } finally {
185 Trace.end(CLASS, this, method);
186 }
187 }
188
189 private String readBackslashToken() {
190 final String method = "readBackslashToken()";
191 Trace.begin(CLASS, this, method);
192 if (getChar() != '\\') {
193 throw new IllegalArgumentException("\\ expected");
194 }
195 readChar(); // read \
196 if (eof()) {
197 Trace.param(CLASS, this, method, "return", null);
198 Trace.end(CLASS, this, method);
199 return null;
200 }
201 if (!Character.isLetter((char) getChar())) {
202 Trace.param(CLASS, this, method, "return", (char) getChar());
203 Trace.end(CLASS, this, method);
204 return "" + ((char) readChar());
205 }
206 final StringBuffer buffer = new StringBuffer();
207 do {
208 buffer.append((char) readChar());
209 } while (!eof() && Character.isLetter((char) getChar()));
210 Trace.param(CLASS, this, method, "return", buffer.toString());
211 Trace.end(CLASS, this, method);
212 return buffer.toString();
213 }
214
215 private int readPureWhitespace() {
216 int lines = 0;
217 while (getChar() != -1 && Character.isWhitespace((char) getChar())) {
218 if ('\n' == (char) getChar()) {
219 lines++;
220 }
221 readChar();
222 }
223 return lines;
224 }
225
226 protected final Operator getOperator(final String token) {
227 Operator result = null;
228 if (token == null) {
229 return null;
230 }
231 for (int i = 0; i < getOperators().size(); i++) {
232 if (token.equals(((Operator) getOperators().get(i)).getStartSymbol())) {
233 result = (Operator) getOperators().get(i);
234 break;
235 }
236 }
237 if (result != null) {
238 return result;
239 }
240 // mime 20080725: no operator found -> return subject variable
241 if (SPECIALCHARACTERS.indexOf(token) < 0) {
242 return new Operator(token, null, null, "VAR", token, 200, 0, 0);
243 }
244 return null;
245 }
246
247 protected final List getOperators(final String token) {
248 final List result = new ArrayList();
249 if (token == null) {
250 return result;
251 }
252 for (int i = 0; i < getOperators().size(); i++) {
253 if (token.equals(((Operator) getOperators().get(i)).getStartSymbol())) {
254 result.add(getOperators().get(i));
255 }
256 }
257 // mime 20080725: no operator found -> return subject variable
258 if (result.size() <= 0 && SPECIALCHARACTERS.indexOf(token) < 0) {
259 result.add(new Operator(token, null, null, "VAR", token, 200, 0, 0));
260 }
261 return result;
262 }
263
264 protected boolean eot(final String token) {
265 return token == null || token.trim().length() == 0;
266 }
267
268 }
|