| 1 | /* This file is part of the project "Hilbert II" - http://www.qedeq.org |
| 2 | * |
| 3 | * Copyright 2000-2014, Michael Meyling <mime@qedeq.org>. |
| 4 | * |
| 5 | * "Hilbert II" is free software; you can redistribute |
| 6 | * it and/or modify it under the terms of the GNU General Public |
| 7 | * License as published by the Free Software Foundation; either |
| 8 | * version 2 of the License, or (at your option) any later version. |
| 9 | * |
| 10 | * This program is distributed in the hope that it will be useful, |
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 13 | * GNU General Public License for more details. |
| 14 | */ |
| 15 | |
| 16 | package org.qedeq.kernel.bo.parser; |
| 17 | |
| 18 | import java.util.ArrayList; |
| 19 | import java.util.List; |
| 20 | |
| 21 | import org.qedeq.base.trace.Trace; |
| 22 | |
| 23 | /* |
| 24 | * TODO mime 20080118: refactor |
| 25 | * |
| 26 | * Whitespace LaTeX form, could be eaten |
| 27 | * \t |
| 28 | * \r |
| 29 | * \n |
| 30 | * \\ |
| 31 | * \\, |
| 32 | * & |
| 33 | * \\\\ |
| 34 | * \\par |
| 35 | * \\quad |
| 36 | * \\qquad |
| 37 | * |
| 38 | * Separator only one allowed, before and after only whitespace is possible |
| 39 | * , |
| 40 | * | |
| 41 | * $$ |
| 42 | * Separator should be read as tokens. |
| 43 | * |
| 44 | * Problem: If some atom is followed by "(" it should be taken as a |
| 45 | * (function) operator. But if we start with readToken we don't see the "(" |
| 46 | * character. |
| 47 | * |
| 48 | * Problem: Could whitespace be recognized? |
| 49 | * Translating whitespace tokens into spaces is not easy, one has to know the |
| 50 | * end of the whitespace token. |
| 51 | * Possible solution: |
| 52 | * function read token (LaTeX specific) |
| 53 | * skip real whitespace (" ", "\t", "\r", "\n") |
| 54 | * read char |
| 55 | * case char |
| 56 | * "\\" read characters or numbers (check LaTeX Syntax) |
| 57 | * "{", "}", "(", ")" are also allowed |
| 58 | * resulting string is token |
| 59 | * LaTeX command definition modifies above: |
| 60 | * Die meisten LATEX-Befehle haben eines der beiden folgenden Formate: Entweder sie beginnen |
| 61 | * mit einem Backslash (\) und haben dann einen nur aus Buchstaben bestehenden Namen, der durch |
| 62 | * ein oder mehrere Leerzeichen oder durch ein nachfolgendes Sonderzeichen oder eine Ziffer beendet |
| 63 | * wird; oder sie bestehen aus einem Backslash und genau einem Sonderzeichen oder einer Ziffer. |
| 64 | * Gross- und Kleinbuchstaben haben auch in Befehlsnamen verschiedene Bedeutung. Wenn man nach |
| 65 | * einem Befehlsnamen eine Leerstelle erhalten will, muss man "{}" zur Beendigung des Befehlsnamens |
| 66 | * oder einen eigenen Befehl f\u00fcr die Leerstelle verwenden. |
| 67 | */ |
| 68 | |
| 69 | /** |
| 70 | * Parse LaTeX term or formula data into {@link org.qedeq.kernel.bo.parser.Term}s. |
| 71 | * |
| 72 | * @author Michael Meyling |
| 73 | */ |
| 74 | public class LatexMathParser extends MathParser { |
| 75 | |
| 76 | /** This class. */ |
| 77 | private static final Class CLASS = LatexMathParser.class; |
| 78 | |
| 79 | /** Characters with special LaTeX meaning. */ |
| 80 | private static final String SPECIALCHARACTERS = "(),{}\\~%$&"; |
| 81 | |
| 82 | /** Counter for token whitespace lines. */ |
| 83 | private int tokenWhiteSpaceLines; |
| 84 | |
| 85 | /** |
| 86 | * Constructor. |
| 87 | * |
| 88 | */ |
| 89 | public LatexMathParser() { |
| 90 | super(); |
| 91 | } |
| 92 | |
| 93 | protected final String readToken() { |
| 94 | final String method = "readToken()"; |
| 95 | Trace.begin(CLASS, this, method); |
| 96 | StringBuffer token = new StringBuffer(); |
| 97 | tokenWhiteSpaceLines = 0; |
| 98 | try { |
| 99 | do { |
| 100 | tokenWhiteSpaceLines += readPureWhitespace(); |
| 101 | if (tokenWhiteSpaceLines > 1) { |
| 102 | break; |
| 103 | } |
| 104 | if (eof()) { |
| 105 | if (token.length() <= 0) { |
| 106 | token = null; |
| 107 | } |
| 108 | break; |
| 109 | } |
| 110 | final int c = getChar(); |
| 111 | if (Character.isDigit((char) c)) { |
| 112 | token.append((char) readChar()); |
| 113 | if (Character.isDigit((char) getChar())) { |
| 114 | continue; |
| 115 | } |
| 116 | break; |
| 117 | } |
| 118 | if (SPECIALCHARACTERS.indexOf(c) >= 0) { |
| 119 | switch (c) { |
| 120 | case '&': |
| 121 | case '%': |
| 122 | case '~': |
| 123 | case '$': // TODO mime 20060504 or break in this case? |
| 124 | readChar(); |
| 125 | continue; |
| 126 | case '\\': |
| 127 | final String t = readBackslashToken(); |
| 128 | if (t.equals(" ") || t.equals("quad") || t.equals("qquad")) { |
| 129 | continue; |
| 130 | } |
| 131 | token.append(t); |
| 132 | if ('_' == getChar() || '^' == getChar()) { |
| 133 | token.append((char) readChar()); |
| 134 | continue; |
| 135 | } |
| 136 | break; |
| 137 | case '{': |
| 138 | readChar(); |
| 139 | token.append("("); |
| 140 | break; |
| 141 | case '}': |
| 142 | readChar(); |
| 143 | token.append(")"); |
| 144 | break; |
| 145 | default: |
| 146 | readChar(); |
| 147 | token.append((char) c); |
| 148 | if ('_' == getChar() || '^' == getChar()) { |
| 149 | token.append((char) readChar()); |
| 150 | continue; |
| 151 | } |
| 152 | } |
| 153 | break; |
| 154 | } |
| 155 | token.append((char) readChar()); |
| 156 | if ('_' == getChar() || '^' == getChar()) { |
| 157 | token.append((char) readChar()); |
| 158 | continue; |
| 159 | } |
| 160 | break; |
| 161 | /* |
| 162 | String operator = null; |
| 163 | markPosition(); |
| 164 | while (!eof() && (Character.isLetterOrDigit((char) getChar()) || '_' == getChar() |
| 165 | || '^' == getChar())) { |
| 166 | token.append((char) readChar()); |
| 167 | if (null != getOperator(token.toString())) { |
| 168 | operator = token.toString(); |
| 169 | clearMark(); |
| 170 | markPosition(); |
| 171 | } |
| 172 | } |
| 173 | if (operator != null) { |
| 174 | rewindPosition(); |
| 175 | token.setLength(0); |
| 176 | token.append(operator); |
| 177 | } else { |
| 178 | clearMark(); |
| 179 | } |
| 180 | */ |
| 181 | } while (!eof()); |
| 182 | Trace.param(CLASS, this, method, "return token", token); |
| 183 | return (token != null ? token.toString() : null); |
| 184 | } finally { |
| 185 | Trace.end(CLASS, this, method); |
| 186 | } |
| 187 | } |
| 188 | |
| 189 | private String readBackslashToken() { |
| 190 | final String method = "readBackslashToken()"; |
| 191 | Trace.begin(CLASS, this, method); |
| 192 | if (getChar() != '\\') { |
| 193 | throw new IllegalArgumentException("\\ expected"); |
| 194 | } |
| 195 | readChar(); // read \ |
| 196 | if (eof()) { |
| 197 | Trace.param(CLASS, this, method, "return", null); |
| 198 | Trace.end(CLASS, this, method); |
| 199 | return null; |
| 200 | } |
| 201 | if (!Character.isLetter((char) getChar())) { |
| 202 | Trace.param(CLASS, this, method, "return", (char) getChar()); |
| 203 | Trace.end(CLASS, this, method); |
| 204 | return "" + ((char) readChar()); |
| 205 | } |
| 206 | final StringBuffer buffer = new StringBuffer(); |
| 207 | do { |
| 208 | buffer.append((char) readChar()); |
| 209 | } while (!eof() && Character.isLetter((char) getChar())); |
| 210 | Trace.param(CLASS, this, method, "return", buffer.toString()); |
| 211 | Trace.end(CLASS, this, method); |
| 212 | return buffer.toString(); |
| 213 | } |
| 214 | |
| 215 | private int readPureWhitespace() { |
| 216 | int lines = 0; |
| 217 | while (getChar() != -1 && Character.isWhitespace((char) getChar())) { |
| 218 | if ('\n' == (char) getChar()) { |
| 219 | lines++; |
| 220 | } |
| 221 | readChar(); |
| 222 | } |
| 223 | return lines; |
| 224 | } |
| 225 | |
| 226 | protected final Operator getOperator(final String token) { |
| 227 | Operator result = null; |
| 228 | if (token == null) { |
| 229 | return null; |
| 230 | } |
| 231 | for (int i = 0; i < getOperators().size(); i++) { |
| 232 | if (token.equals(((Operator) getOperators().get(i)).getStartSymbol())) { |
| 233 | result = (Operator) getOperators().get(i); |
| 234 | break; |
| 235 | } |
| 236 | } |
| 237 | if (result != null) { |
| 238 | return result; |
| 239 | } |
| 240 | // mime 20080725: no operator found -> return subject variable |
| 241 | if (SPECIALCHARACTERS.indexOf(token) < 0) { |
| 242 | return new Operator(token, null, null, "VAR", token, 200, 0, 0); |
| 243 | } |
| 244 | return null; |
| 245 | } |
| 246 | |
| 247 | protected final List getOperators(final String token) { |
| 248 | final List result = new ArrayList(); |
| 249 | if (token == null) { |
| 250 | return result; |
| 251 | } |
| 252 | for (int i = 0; i < getOperators().size(); i++) { |
| 253 | if (token.equals(((Operator) getOperators().get(i)).getStartSymbol())) { |
| 254 | result.add(getOperators().get(i)); |
| 255 | } |
| 256 | } |
| 257 | // mime 20080725: no operator found -> return subject variable |
| 258 | if (result.size() <= 0 && SPECIALCHARACTERS.indexOf(token) < 0) { |
| 259 | result.add(new Operator(token, null, null, "VAR", token, 200, 0, 0)); |
| 260 | } |
| 261 | return result; |
| 262 | } |
| 263 | |
| 264 | protected boolean eot(final String token) { |
| 265 | return token == null || token.trim().length() == 0; |
| 266 | } |
| 267 | |
| 268 | } |