-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword_stats.cpp
More file actions
274 lines (231 loc) · 6.29 KB
/
word_stats.cpp
File metadata and controls
274 lines (231 loc) · 6.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
#include <common.h>
#include <boost/ptr_container/ptr_vector.hpp>
#include <boost/bind.hpp>
#include <fstream>
#include <vector>
#include <map>
#include <memory>
#include <algorithm>
#include <cmath>
namespace WN = WordsNumber;
using std::string;
using std::vector;
typedef std::map<Word, uint> WordsMap;
typedef std::map<Pair, uint> PairsMap;
typedef std::vector<Pair> PairsVector;
const size_t RESULT_SIZE = 100;
struct CalcData {
CalcData()
: nWords(0)
{}
size_t nWords;
WordsMap wordsMap;
PairsMap pairsMap;
};
// Interface of a statistic that assigns a numeric score to a word pair.
class ValueCalculator {
public:
    // Virtual so that implementations can be destroyed through this interface.
    virtual ~ValueCalculator() {}

    // Score of `pair`. Comparator orders pairs so that larger scores come first.
    virtual double get(const Pair& pair) const = 0;

    // Short label of the statistic; processCalculator() prints it and appends
    // it to the output path to form the result file name.
    virtual string name() const = 0;
};
class Comparator {
public:
// uses the ref given
Comparator(const ValueCalculator& calculator)
: calculator_(calculator)
{}
bool operator() (const Pair& lhs, const Pair& rhs) const
{
return calculator_.get(lhs) > calculator_.get(rhs);
}
private:
const ValueCalculator& calculator_;
};
void getKeyPhrases(const Comparator& comparator, const PairsVector& allPairs,
size_t number, PairsVector* result)
{
result->clear();
const size_t nPairs = std::min(number, allPairs.size());
result->resize(nPairs);
std::partial_sort_copy(allPairs.begin(), allPairs.end(),
result->begin(), result->end(), comparator);
}
template<class T>
void readPhrasesMap(std::istream& is,
std::map<typename Phrase<T>::type, uint>* phrases)
{
size_t nPhrases;
is >> nPhrases;
phrases->clear();
for (size_t iPhrase = 0; iPhrase < nPhrases; ++iPhrase) {
typename Phrase<T>::type phrase;
is >> phrase;
is >> (*phrases)[phrase];
}
}
// Loads the statistics file `fileName` into `*data`.
//
// The file is read in this order: the word total, the single-word table,
// then the pair table (both tables through readPhrasesMap()).
//
// `*data` is always reset first. If the file cannot be opened or cannot be
// parsed completely, a message is written to std::cerr and `*data` is left
// empty (zero words, empty maps). Partially read tables are not kept because
// the calculators look up every word of every pair without checking that the
// lookup succeeded.
void readData(const char* fileName, CalcData* data)
{
    data->nWords = 0;
    data->wordsMap.clear();
    data->pairsMap.clear();

    std::ifstream fin(fileName);
    if (!fin.is_open()) {
        std::cerr << "Could not open file " << fileName << "\n";
        return;
    }

    std::cout << "Reading data...\n";
    fin >> data->nWords;
    readPhrasesMap<WN::Single>(fin, &(data->wordsMap));
    readPhrasesMap<WN::Pair>(fin, &(data->pairsMap));

    // failbit is set only when an extraction really failed; merely reaching
    // end-of-file after the last value does not set it.
    if (fin.fail()) {
        std::cerr << "Could not parse file " << fileName << "\n";
        data->nWords = 0;
        data->wordsMap.clear();
        data->pairsMap.clear();
    }
    // fin is closed by its destructor.
}
void processCalculator(const ValueCalculator& calculator,
const string& path, const PairsVector& allPairs)
{
std::cout << "Processing " << calculator.name() << "..." << std::endl;
PairsVector result;
getKeyPhrases(Comparator(calculator), allPairs, RESULT_SIZE, &result);
std::ofstream fout(path + calculator.name());
if (!fout.is_open()) {
std::cerr << "Could not open file " << calculator.name() << "for writing\n";
return;
}
// ostream_iterator won't work here
for (PairsVector::const_iterator it = result.begin();
it != result.end(); ++it)
{
fout << *it << "\n";
}
fout.close();
}
// Replaces the contents of `*vector` with the keys of `map`, in the map's
// iteration order (ascending by key).
template<class Key, class Value>
void mapKeys2vector(const std::map<Key, Value>& map, std::vector<Key>* vector)
{
    typedef typename std::map<Key, Value>::const_iterator EntryIterator;

    std::vector<Key>& keys = *vector;
    keys.clear();
    keys.reserve(map.size());  // one allocation instead of repeated growth

    const EntryIterator last = map.end();
    EntryIterator entry = map.begin();
    while (entry != last) {
        keys.push_back(entry->first);
        ++entry;
    }
}
class FrequencyCalculator : public ValueCalculator {
public:
// uses PairsMap ref given
FrequencyCalculator(const PairsMap& pairsMap)
: pairsMap_(pairsMap)
{}
double get(const Pair& pair) const
{
return static_cast<double>(pairsMap_.find(pair)->second);
}
string name() const
{
return "frequency";
}
private:
const PairsMap& pairsMap_;
};
class TStatCalculator : public ValueCalculator {
public:
// uses the CalcData ref given
TStatCalculator(const CalcData& data)
: data_(data)
{}
double get(const Pair& pair) const
{
const uint pairFreq = data_.pairsMap.find(pair)->second;
const uint firstFreq = data_.wordsMap.find(pair.first)->second;
const uint secondFreq = data_.wordsMap.find(pair.second)->second;
const double t = (pairFreq - 1.0 * firstFreq * secondFreq / data_.nWords) /
std::sqrt(1.0 * pairFreq);
return -t;
}
string name() const
{
return "t-statistic";
}
private:
const CalcData& data_;
};
// Returns x multiplied by itself, computed in the type T.
template<class T>
T sqr(T x)
{
    T squared = x * x;
    return squared;
}
// Scores a pair with the negated Pearson chi-squared statistic of the 2x2
// contingency table built from the pair and word frequencies.
class ChiSquaredCalculator : public ValueCalculator {
public:
    // Stores a reference only; `data` must outlive this object.
    ChiSquaredCalculator(const CalcData& data)
        : data_(data)
    {}

    // Returns -chi^2 with
    //   chi^2 = N * (O11*O22 - O12*O21)^2
    //           / ((O11+O12) * (O11+O21) * (O12+O22) * (O21+O22))
    // where O11 = pair frequency, O12 / O21 = occurrences of the first /
    // second word outside this pair, and O22 = everything else out of
    // N = nWords. Returns 0 when a table margin is empty, because the
    // statistic is then undefined.
    //
    // Precondition: the pair and both of its words are present in the
    // tables; the lookup results are dereferenced without a check.
    //
    // NOTE(review): because of the negation, Comparator (larger score first)
    // ranks the pairs with the smallest chi^2 first. Confirm this is intended.
    double get(const Pair& pair) const
    {
        // All arithmetic is done in double: the products below overflow int
        // for realistic corpus sizes, and mixing size_t with int silently
        // switches to unsigned arithmetic.
        const double N = static_cast<double>(data_.nWords);
        const double O11 =
            static_cast<double>(data_.pairsMap.find(pair)->second);
        const double firstFreq =
            static_cast<double>(data_.wordsMap.find(pair.first)->second);
        const double secondFreq =
            static_cast<double>(data_.wordsMap.find(pair.second)->second);

        /*
        word frequency can be less than pair frequency with this word
        because of deleting rare phrases in text processing, hence the
        clamping of the cells at zero
        */
        const double O12 = std::max(0.0, firstFreq - O11);
        const double O21 = std::max(0.0, secondFreq - O11);
        // The four cells must add up to N.
        const double O22 = std::max(0.0, N - O11 - O12 - O21);

        const double denominator =
            (O11 + O12) * (O11 + O21) * (O12 + O22) * (O21 + O22);
        if (denominator <= 0.0) {
            // Avoid inf/NaN scores: NaN would break the strict weak ordering
            // that the sorting in getKeyPhrases() relies on.
            return 0.0;
        }

        const double difference = O11 * O22 - O12 * O21;
        const double chisq = N * difference * difference / denominator;
        return -chisq;
    }

    string name() const
    {
        return "chi-squared-test";
    }

private:
    const CalcData& data_;
};
// Scores a pair with the log-likelihood-ratio statistic -2*log(lambda),
// comparing "the second word is independent of the first" against
// "the second word depends on the first".
class LikelihoodCalculator : public ValueCalculator {
public:
    // Stores a reference only; `data` must outlive this object.
    LikelihoodCalculator(const CalcData& data)
        : data_(data)
    {}

    // Returns -2 * log(lambda), where
    //   log(lambda) = logL(c12, c1, p)  + logL(c2 - c12, N - c1, p)
    //               - logL(c12, c1, p1) - logL(c2 - c12, N - c1, p2)
    // with p = c2 / N, p1 = c12 / c1, p2 = (c2 - c12) / (N - c1).
    // Larger values mean stronger evidence that the words are associated.
    // Returns 0 when the counts do not form a usable table (no occurrences of
    // the first word, nothing but the first word, or more occurrences of the
    // second word than there are positions left).
    //
    // Precondition: the pair and both of its words are present in the
    // tables; the lookup results are dereferenced without a check.
    double get(const Pair& pair) const
    {
        // Doubles throughout: the original unsigned arithmetic wrapped around
        // on c2 - c12 and truncated N - c1 from size_t to uint.
        const double N = static_cast<double>(data_.nWords);
        const double c12 =
            static_cast<double>(data_.pairsMap.find(pair)->second);
        // A word frequency can be smaller than the frequency of a pair
        // containing it (rare phrases are deleted during text processing),
        // so each word count is raised to at least the pair count.
        const double c1 = std::max(c12,
            static_cast<double>(data_.wordsMap.find(pair.first)->second));
        const double c2 = std::max(c12,
            static_cast<double>(data_.wordsMap.find(pair.second)->second));

        const double afterFirst = c1;        // trials following the first word
        const double elsewhere = N - c1;     // all remaining trials
        const double hitsAfterFirst = c12;   // second word right after first
        const double hitsElsewhere = c2 - c12;
        if (afterFirst <= 0.0 || elsewhere <= 0.0 || hitsElsewhere > elsewhere) {
            return 0.0;
        }

        const double p = c2 / N;
        const double p1 = hitsAfterFirst / afterFirst;
        const double p2 = hitsElsewhere / elsewhere;
        const double logLambda =
            logL(hitsAfterFirst, afterFirst, p)
            + logL(hitsElsewhere, elsewhere, p)
            - logL(hitsAfterFirst, afterFirst, p1)
            - logL(hitsElsewhere, elsewhere, p2);
        return -2.0 * logLambda;
    }

    string name() const
    {
        return "likelihood";
    }

private:
    // weight * log(x), with the convention 0 * log(0) = 0 so that an empty
    // cell contributes nothing instead of producing NaN.
    static double weightedLog(double weight, double x)
    {
        return weight > 0.0 ? weight * std::log(x) : 0.0;
    }

    // Binomial log-likelihood (without the constant binomial coefficient) of
    // `k` successes in `n` trials with success probability `x`:
    //   k * log(x) + (n - k) * log(1 - x).
    // The parameter order (k, n, x) now matches the call sites in get().
    static double logL(double k, double n, double x)
    {
        return weightedLog(k, x) + weightedLog(n - k, 1.0 - x);
    }

    const CalcData& data_;
};
int main()
{
const string path = "c:\\ys\\text\\";
const string statsFileName(path + "stats");
std::auto_ptr<CalcData> data(new CalcData);
readData(statsFileName.c_str(), data.get());
std::auto_ptr<PairsVector> allPairs(new PairsVector());
mapKeys2vector(data->pairsMap, allPairs.get());
boost::ptr_vector<ValueCalculator> calculators;
calculators.push_back(new FrequencyCalculator(data->pairsMap));
calculators.push_back(new TStatCalculator(*data));
calculators.push_back(new ChiSquaredCalculator(*data));
calculators.push_back(new LikelihoodCalculator(*data));
std::for_each(calculators.begin(), calculators.end(),
boost::bind(processCalculator, _1, path, *allPairs));
return 0;
}