package info.ephyra.indexing;

import info.ephyra.io.MsgPrinter;
import info.ephyra.nlp.RegExMatcher;
import info.ephyra.util.FileUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:info/ephyra/indexing/AQUAINT2Preprocessor.class */
/**
 * Command-line tool that rewrites, in place, every file below a given
 * directory whose name contains no '.' character. Two passes are made over
 * the directory:
 * <ol>
 * <li>{@link #addParagraphTags()} inserts missing <code>&lt;P&gt;</code> and
 * <code>&lt;/P&gt;</code> lines between <code>&lt;TEXT&gt;</code> and
 * <code>&lt;/TEXT&gt;</code> and drops whitespace-only lines;</li>
 * <li>{@link #convertToTrectext()} drops the XML prolog and
 * <code>DOCSTREAM</code> wrapper lines, renames <code>HEADLINE</code> tags to
 * <code>TITLE</code> and expands each <code>&lt;DOC id=".." type=".."&gt;</code>
 * header into separate <code>DOC</code>, <code>DOCNO</code> and
 * <code>DOCTYPE</code> lines.</li>
 * </ol>
 * Files are read and written as UTF-8. Each file is fully read into memory
 * before it is overwritten.
 */
public class AQUAINT2Preprocessor {
    /** Character encoding used for both reading and rewriting the files. */
    private static final String ENCODING = "UTF-8";

    /** Document header line; group 1 is the document id, group 2 the document type. */
    private static final Pattern DOC_HEADER = Pattern.compile("\\s*+<DOC\\s*+id=\"([^\"]*+)\"\\s*+type=\"([^\"]*+)\"\\s*+>\\s*+");

    /** A line that is empty or consists of whitespace only. */
    private static final Pattern BLANK_LINE = Pattern.compile("\\s*+");

    /** A line that consists of exactly one tag, optionally surrounded by whitespace. */
    private static final Pattern TAG_ONLY_LINE = Pattern.compile("\\s*+<[^>]++>\\s*+");

    /** Lines that are dropped by the 'trectext' conversion. */
    private static final Pattern XML_DECLARATION = Pattern.compile("\\s*+<\\?xml .*+");
    private static final Pattern DOCTYPE_DECLARATION = Pattern.compile("\\s*+<!DOCTYPE .*+");
    private static final Pattern DOCSTREAM_OPEN = Pattern.compile("\\s*+<DOCSTREAM>\\s*+");
    private static final Pattern DOCSTREAM_CLOSE = Pattern.compile("\\s*+</DOCSTREAM>\\s*+");

    /** Lines that are renamed to TITLE tags by the 'trectext' conversion. */
    private static final Pattern HEADLINE_OPEN = Pattern.compile("\\s*+<HEADLINE>\\s*+");
    private static final Pattern HEADLINE_CLOSE = Pattern.compile("\\s*+</HEADLINE>\\s*+");

    /** Root directory of the files to process; set from the first command-line argument. */
    private static String dir;

    /**
     * Adds missing paragraph tags to all files below {@link #dir} whose name
     * does not contain a '.', overwriting each file with the result.
     *
     * @return <code>true</code> iff every file was read and rewritten
     *         successfully; processing stops at the first failure
     */
    private static boolean addParagraphTags() {
        for (File file : FileUtils.getFilesRec(dir)) {
            if (file.getName().contains(".")) {
                MsgPrinter.printStatusMsg("Ignoring " + file.getPath() + ".\n");
                continue;
            }
            MsgPrinter.printStatusMsg("Parsing " + file.getName() + "...");

            ArrayList<String> output;
            try {
                output = tagParagraphs(file);
            } catch (IOException e) {
                MsgPrinter.printErrorMsg("Could not read " + file.getPath() + ": " + e.getMessage());
                return false;
            }
            if (!writeLines(file, output)) {
                return false;
            }
            MsgPrinter.printStatusMsg("...parsed.\n");
        }
        return true;
    }

    /**
     * Reads a file and returns its lines with paragraph tags completed and
     * whitespace-only lines removed. A status message is printed for each
     * document in which a tag was inserted, except for the document types
     * excluded by {@link #isReportedType(String)}.
     *
     * @param file file to read
     * @return the lines to write back to the file
     * @throws IOException if the file cannot be opened or read
     */
    private static ArrayList<String> tagParagraphs(File file) throws IOException {
        ArrayList<String> output = new ArrayList<String>();
        BufferedReader reader = openReader(file);
        try {
            boolean inText = false;       // a <TEXT> line was seen, </TEXT> not yet
            boolean inParagraph = false;  // a paragraph is currently open
            boolean modified = false;     // a tag was inserted into the current document
            String docId = null;
            String docType = null;

            String line;
            // readLine() returning null is the only reliable end-of-file signal;
            // ready() merely tells whether a read would block.
            while ((line = reader.readLine()) != null) {
                String previous = output.isEmpty() ? "" : output.get(output.size() - 1);

                Matcher header = DOC_HEADER.matcher(line);
                if (header.find()) {
                    docId = header.group(1);
                    docType = header.group(2);
                }

                // The state is evaluated before the <TEXT> check below, so the
                // <TEXT> line itself is never treated as paragraph content.
                if (inText) {
                    if (inParagraph) {
                        if (line.contains("</P>")) {
                            inParagraph = false;
                        } else if (line.contains("<P>")) {
                            // new paragraph starts while the previous one is still open
                            output.add("</P>");
                            modified = true;
                        } else if (TAG_ONLY_LINE.matcher(line).matches()) {
                            // some other tag line ends the open paragraph
                            output.add("</P>");
                            modified = true;
                            inParagraph = false;
                        } else if (!previous.contains("<P>") && BLANK_LINE.matcher(line).matches()) {
                            // blank line inside a non-empty paragraph acts as a paragraph break
                            output.add("</P>");
                            output.add("<P>");
                            modified = true;
                        }
                    } else if (line.contains("<P>")) {
                        inParagraph = true;
                    } else if (line.contains("</P>")) {
                        // closing tag without an opening tag
                        output.add("<P>");
                        modified = true;
                    } else if (!TAG_ONLY_LINE.matcher(line).matches()) {
                        // content outside of any paragraph opens a new one
                        output.add("<P>");
                        modified = true;
                        inParagraph = true;
                    }
                }

                if (line.contains("<TEXT>")) {
                    inText = true;
                }
                if (line.contains("</TEXT>")) {
                    if (modified && isReportedType(docType)) {
                        MsgPrinter.printStatusMsg("Document " + docId + " of type '" + docType + "' modified.");
                    }
                    modified = false;
                    inText = false;
                }

                if (!BLANK_LINE.matcher(line).matches()) {
                    output.add(line);
                }
            }
        } finally {
            reader.close();
        }
        return output;
    }

    /**
     * Decides whether a modification of a document of the given type is
     * reported. Modifications are not reported for the types "multi", "advis"
     * and {@link RegExMatcher#OTHER}.
     *
     * @param docType document type from the most recent document header, or
     *            <code>null</code> if no header has been seen yet
     * @return <code>true</code> iff the modification should be reported
     */
    private static boolean isReportedType(String docType) {
        if (docType == null) {
            // no header seen: still report instead of failing with a NullPointerException
            return true;
        }
        return !docType.equals("multi") && !docType.equals("advis") && !docType.equals(RegExMatcher.OTHER);
    }

    /**
     * Converts all files below {@link #dir} whose name does not contain a '.'
     * to 'trectext' format, overwriting each file with the result.
     *
     * @return <code>true</code> iff every file was read and rewritten
     *         successfully; processing stops at the first failure
     */
    private static boolean convertToTrectext() {
        for (File file : FileUtils.getFilesRec(dir)) {
            if (file.getName().contains(".")) {
                MsgPrinter.printStatusMsg("Ignoring " + file.getPath() + ".\n");
                continue;
            }
            MsgPrinter.printStatusMsg("Parsing " + file.getName() + "...");

            ArrayList<String> output;
            try {
                output = toTrectext(file);
            } catch (IOException e) {
                MsgPrinter.printErrorMsg("Could not read " + file.getPath() + ": " + e.getMessage());
                return false;
            }
            if (!writeLines(file, output)) {
                return false;
            }
            MsgPrinter.printStatusMsg("...parsed.\n");
        }
        return true;
    }

    /**
     * Reads a file and returns its lines converted to 'trectext' format.
     * Dropped lines are echoed to standard output.
     *
     * @param file file to read
     * @return the lines to write back to the file
     * @throws IOException if the file cannot be opened or read
     */
    private static ArrayList<String> toTrectext(File file) throws IOException {
        ArrayList<String> output = new ArrayList<String>();
        BufferedReader reader = openReader(file);
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                if (isDroppedLine(line)) {
                    System.out.println("Dropping line: " + line);
                } else if (HEADLINE_OPEN.matcher(line).matches()) {
                    output.add("<TITLE>");
                } else if (HEADLINE_CLOSE.matcher(line).matches()) {
                    output.add("</TITLE>");
                } else {
                    Matcher header = DOC_HEADER.matcher(line);
                    if (header.find()) {
                        // split the attributes of the header into separate lines
                        output.add("<DOC>");
                        output.add("<DOCNO>" + header.group(1) + "</DOCNO>");
                        output.add("<DOCTYPE>" + header.group(2) + "</DOCTYPE>");
                    } else {
                        output.add(line);
                    }
                }
            }
        } finally {
            reader.close();
        }
        return output;
    }

    /**
     * Checks whether a line is removed by the 'trectext' conversion, i.e.
     * whether it is an XML declaration, a DOCTYPE declaration or an opening or
     * closing <code>DOCSTREAM</code> tag.
     *
     * @param line line to check
     * @return <code>true</code> iff the line is dropped
     */
    private static boolean isDroppedLine(String line) {
        return XML_DECLARATION.matcher(line).matches()
                || DOCTYPE_DECLARATION.matcher(line).matches()
                || DOCSTREAM_OPEN.matcher(line).matches()
                || DOCSTREAM_CLOSE.matcher(line).matches();
    }

    /**
     * Opens a file for line-wise reading in {@link #ENCODING}.
     *
     * @param file file to open
     * @return reader that the caller must close
     * @throws IOException if the file cannot be opened
     */
    private static BufferedReader openReader(File file) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(file), ENCODING));
    }

    /**
     * Overwrites a file with the given lines, encoded in {@link #ENCODING} and
     * terminated with the platform line separator. Failures are reported
     * through <code>MsgPrinter.printErrorMsg</code>.
     *
     * @param file file to overwrite
     * @param lines lines to write
     * @return <code>true</code> iff the file was opened and all lines were
     *         written without error
     */
    private static boolean writeLines(File file, ArrayList<String> lines) {
        PrintWriter writer;
        try {
            writer = new PrintWriter(new OutputStreamWriter(new FileOutputStream(file), ENCODING));
        } catch (IOException e) {
            MsgPrinter.printErrorMsg("Could not open " + file.getPath() + " for writing: " + e.getMessage());
            return false;
        }
        try {
            for (String line : lines) {
                writer.println(line);
            }
        } finally {
            writer.close();
        }
        // PrintWriter swallows IOExceptions; the error flag also covers
        // failures that occurred while flushing on close.
        if (writer.checkError()) {
            MsgPrinter.printErrorMsg("Could not write " + file.getPath() + ".");
            return false;
        }
        return true;
    }

    /**
     * Entry point: adds paragraph tags to the files in the given directory and
     * then converts them to 'trectext' format. Exits with status 1 if the
     * directory argument is missing or if either pass fails.
     *
     * @param strArr command-line arguments; the first one is the directory to
     *            process
     */
    public static void main(String[] strArr) {
        if (strArr.length < 1) {
            MsgPrinter.printUsage("java AQUAINT2Preprocessor AQUAINT2_directory");
            System.exit(1);
        }
        dir = strArr[0];

        MsgPrinter.enableStatusMsgs(true);
        MsgPrinter.enableErrorMsgs(true);

        MsgPrinter.printStatusMsg("Adding paragraph tags:\n");
        if (addParagraphTags()) {
            MsgPrinter.printStatusMsg("Paragraph tags added successfully.\n");
        } else {
            MsgPrinter.printErrorMsg("Could not add paragraph tags.");
            System.exit(1);
        }

        MsgPrinter.printStatusMsg("Converting to 'trectext' format:\n");
        if (convertToTrectext()) {
            MsgPrinter.printStatusMsg("Documents converted successfully.");
        } else {
            MsgPrinter.printErrorMsg("Could not convert documents.");
            System.exit(1);
        }
    }
}
