Bachelor Project - Relation Extraction
Extracting location-based relations from geonames data
 All Classes
GeoReader.h
1 // Copyright 2012, University of Freiburg
2 // Anton Stepan <stepana@informatik.uni-freiburg.de>
3 // Marius Bethge <marius.bethge@gmail.com>
4 
5 #ifndef TRUNK_GEONAMES_GEOREADER_V2_GEOREADER_H_
6 #define TRUNK_GEONAMES_GEOREADER_V2_GEOREADER_H_
7 
8 #include <gtest/gtest.h>
9 #include <assert.h>
10 #include <chrono>
11 #include <unordered_map>
12 #include <getopt.h>
13 #include <algorithm>
14 #include <fstream>
15 #include <sstream>
16 #include <iostream>
17 #include <string>
18 #include <vector>
19 #include "./Location.h"
20 #include "./Country.h"
21 
22 using std::string;
23 
// GeoReader reads the geonames input files (countryInfo, admin1, admin2 and
// allCountries) into in-memory maps and generates one output file per
// relation (isA, locatedIn, hasPopulation, hasLongitude, hasLatitude,
// hasCapital). Only declarations are visible in this header; the member
// functions are defined elsewhere.
24 class GeoReader {
25  public:
26  GeoReader();
27 
    // NOTE(review): presumably resets the option members (see "Options"
    // below) to their standard values -- confirm against the implementation.
28  void setStandardParameters();
29  // Prints usage info to the error stream.
30  void printUsage();
    // Reports errorMessage to the user.
    // NOTE(review): the output channel is not visible from this header.
31  void printError(const string& errorMessage);
32  // Parses the command-line arguments. Returns true on success.
33  bool parseCommandlineArguments(int argc, char** argv);
34  // Read the countryInfo file / asciiAdmin1 file / admin2 file and fill the
35  // "_<FeatureType>" and "_<FeatureType>Lookup" maps, respectively.
    // Each FRIEND_TEST (gtest) line grants the named test of the GeoReaderTest
    // test case access to the private members of this class.
36  void readCountryFile();
37  FRIEND_TEST(GeoReaderTest, readCountryFile);
38  void readAdmin1File();
39  FRIEND_TEST(GeoReaderTest, readAdmin1File);
40  void readAdmin2File();
41  FRIEND_TEST(GeoReaderTest, readAdmin2File);
42  // Reads and processes the allCountries file, thereby complementing the
43  // _cities, _admin1s and _admin2s maps.
44  void readAllCountries();
45  FRIEND_TEST(GeoReaderTest, readAllCountries);
46  // Marks those locations that have the same name as other entities in the
47  // ontology, e.g. the city Homer and the philosopher Homer. Also marks
48  // locations that have the same name as a country.
49  void markReservedNames();
50  FRIEND_TEST(GeoReaderTest, markReservedNames);
51  // Adds suffixes to locations as needed.
52  void generateSuffixes();
53  FRIEND_TEST(GeoReaderTest, generateSuffixes);
54 
55  // Generate the output files, one relation per call. Each function takes the
    // name of the file to write.
    // NOTE(review): whether filename is resolved relative to _outputDir is not
    // visible here -- confirm against the implementation.
56  void generateIsA(const std::string& filename);
57  void generateLocatedIn(const std::string& filename);
58  void generateHasPopulation(const std::string& filename);
59  void generateHasLongitude(const std::string& filename);
60  void generateHasLatitude(const std::string& filename);
61  void generateHasCapital(const std::string& filename);
62 
63  // Getters. Each returns a copy of the corresponding option string.
64  string getAllCountriesFileName() const { return _allCountriesFileName; }
65  string getNoLocationFileName() const { return _noLocationFileName; }
66  string getCountryInfoFileName() const { return _countryInfoFileName; }
67  string getAdmin1FileName() const { return _admin1FileName; }
68  string getAdmin2FileName() const { return _admin2FileName; }
69  string getOutputDir() const { return _outputDir; }
70 
71  private:
72  // map<"countryID", "geonamesID">
73  std::unordered_map<string, int> _countryLookup;
74  // map<"admin1ID", "geonamesID">
75  std::unordered_map<string, int> _admin1Lookup;
76  // map<"admin2ID", "geonamesID">
77  std::unordered_map<string, int> _admin2Lookup;
78 
79  // map<"geonamesID", Country>
80  std::unordered_map<int, Country> _countries;
81  // map<"geonamesID", Location> (same key/value layout for all three maps).
82  std::unordered_map<int, Location> _admin1s;
83  std::unordered_map<int, Location> _admin2s;
84  std::unordered_map<int, Location> _cities;
85 
86  // List of non-owning pointers to the capitals stored in _cities.
    // NOTE(review): std::unordered_map keeps element addresses stable across
    // rehashing, so these stay valid as long as the entries are not erased.
87  std::vector<Location*> _capitals;
88 
89  // Map of all the names that shall not be used by locations
90  // because other entities already use them.
91  std::unordered_map<string, bool> _blackList;
92  // Creates the file fileName (replacing an existing one) with the content
    // of the stream text.
93  void addToFile(const string& fileName, const std::ostringstream& text) const;
94  // For a list of locations with the same name: assigns suffixes as necessary
95  // to the locations, making them uniquely identifiable.
    // The vector of pointers is taken by value, i.e. copied on each call.
96  void suffixGenerator(std::vector<Location*> duplicates);
97  // Generates and returns the suffix string for the given Location +
98  // FeatureType combination. Does not modify the Location.
    // NOTE(review): the meaning of the extra parameter i of the second
    // overload is not visible from this header.
99  const string sufString(const Location* ptr, const FeatureType f) const;
100  const string sufString(const Location* ptr, const FeatureType f,
101  const int& i) const;
102  // Helpers.
    // NOTE(review): isBListed presumably reports whether s is contained in
    // _blackList -- confirm against the implementation. The top-level const on
    // its by-value bool return type has no effect.
103  const bool isBListed(const string& s) const;
    // NOTE(review): the _generate* helpers presumably build the output text
    // for one location/country; the exact format is not visible here.
104  std::string _generateLatitude(Location* loc);
105  std::string _generateLongitude(Location* loc);
106  std::string _generatePopulation(Location* loc);
107  std::string _generatePopulation(Country* loc);
108  // Options, with their in-class default values: the file names default to
    // the empty string and the output directory to "./".
109  string _allCountriesFileName = "";
110  string _noLocationFileName = "";
111  string _countryInfoFileName = "";
112  string _admin1FileName = "";
113  string _admin2FileName = "";
114  string _outputDir = "./";
    // NOTE(review): content and use of _locationClasses are not visible here.
115  std::vector<std::string> _locationClasses;
    // Defaults to 999. NOTE(review): presumably a population threshold for
    // locations -- confirm how it is compared in the implementation.
116  int _populationLimit = 999;
117 };
118 
119 #endif // TRUNK_GEONAMES_GEOREADER_V2_GEOREADER_H_