Commit ce2fcee

Merge pull request #4 from Guillemdb/refactor
Refactor
2 parents e4151aa + c52da30; commit ce2fcee

99 files changed: 6680 additions & 0 deletions

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# flake8: noqa
from sourced.ml.core.algorithms.tf_idf import log_tf_log_idf
from sourced.ml.core.algorithms.uast_ids_to_bag import UastIds2Bag, uast2sequence
from sourced.ml.core.algorithms.uast_struct_to_bag import UastRandomWalk2Bag, UastSeq2Bag
from sourced.ml.core.algorithms.uast_inttypes_to_nodes import Uast2QuantizedChildren
from sourced.ml.core.algorithms.uast_inttypes_to_graphlets import Uast2GraphletBag
from sourced.ml.core.algorithms.uast_to_role_id_pairs import Uast2RoleIdPairs
from sourced.ml.core.algorithms.uast_id_distance import Uast2IdLineDistance, Uast2IdTreeDistance
from sourced.ml.core.algorithms.uast_to_id_sequence import Uast2IdSequence
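These re-exports flatten the package namespace; a quick sketch of what they enable (both imports below refer to the same class):

```python
# Equivalent imports thanks to the re-exports above:
from sourced.ml.core.algorithms import Uast2IdSequence
from sourced.ml.core.algorithms.uast_to_id_sequence import Uast2IdSequence
```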
Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
import numpy


def extract_coocc_matrix(global_shape, word_indices, model):
    """
    Extract the token co-occurrence matrix from `model` and convert it to the
    coordinates of the global vocabulary.

    :param global_shape: shape of the resulting matrix in global coordinates.
    :param word_indices: mapping from token to its index in the global vocabulary.
    :param model: co-occurrence model exposing `tokens` and a sparse `matrix`;
                  at least one of its tokens must appear in `word_indices`.
    :return: scipy CSR matrix of shape `global_shape`.
    """
    # Stage 1 - extract the tokens, map them to the global vocabulary
    indices = []
    mapped_indices = []
    for i, w in enumerate(model.tokens):
        gi = word_indices.get(w)
        if gi is not None:
            indices.append(i)
            mapped_indices.append(gi)
    indices = numpy.array(indices)
    mapped_indices = numpy.array(mapped_indices)
    # Stage 2 - sort the matched tokens by the index in the vocabulary
    order = numpy.argsort(mapped_indices)
    indices = indices[order]
    mapped_indices = mapped_indices[order]
    # Stage 3 - produce the csr_matrix with the matched tokens **only**
    matrix = model.matrix.tocsr()[indices][:, indices]
    # Stage 4 - convert this matrix to the global (ccmatrix) coordinates
    csr_indices = matrix.indices
    for i, v in enumerate(csr_indices):
        # Here we use the fact that indices and mapped_indices are in the same order
        csr_indices[i] = mapped_indices[v]
    csr_indptr = matrix.indptr
    new_indptr = [0]
    for i, v in enumerate(mapped_indices):
        prev_ptr = csr_indptr[i]
        ptr = csr_indptr[i + 1]

        # Handle missing rows
        prev = (mapped_indices[i - 1] + 1) if i > 0 else 0
        for _ in range(prev, v):
            new_indptr.append(prev_ptr)

        new_indptr.append(ptr)
    # Pad the tail for global rows after the last matched token
    for _ in range(mapped_indices[-1] + 1, global_shape[0]):
        new_indptr.append(csr_indptr[-1])
    matrix.indptr = numpy.array(new_indptr)
    matrix._shape = global_shape
    return matrix
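For reference, a minimal usage sketch. The `SimpleNamespace` object is a stand-in for the real co-occurrence model, which exposes `tokens` and a scipy sparse `matrix`; all values are illustrative:

```python
from types import SimpleNamespace

import numpy
from scipy import sparse

# Local model: two tokens that co-occur twice.
model = SimpleNamespace(tokens=["foo", "bar"],
                        matrix=sparse.coo_matrix(numpy.array([[0, 2], [2, 0]])))
# Global vocabulary of 4 words; "foo" and "bar" map to global indices 3 and 1.
word_indices = {"alpha": 0, "bar": 1, "gamma": 2, "foo": 3}

coocc = extract_coocc_matrix((4, 4), word_indices, model)
print(coocc.toarray())
# [[0 0 0 0]
#  [0 0 0 2]
#  [0 0 0 0]
#  [0 2 0 0]]
```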
Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
# Neural Identifier Splitter
Based on the article [Splitting source code identifiers using Bidirectional LSTM Recurrent Neural Network](https://arxiv.org/abs/1805.11651).

### Agenda
* Data
* Training pipeline
* How to launch

### Data
You can download the dataset [here](https://drive.google.com/open?id=1wZR5zF1GL1fVcA1gZuAN_9rSLd5ssqKV). More information about the dataset is available [here](https://github.com/src-d/datasets/tree/master/Identifiers).
#### Data format
* file format: `.csv.gz`.
* the `csv` structure (a reading sketch follows the table):

|num_files|num_occ|num_repos|token|token_split|
|:--|:--|:--|:--|:--|
|1|2|1|quesesSet|queses set|
|...|...|...|...|...|
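To peek at the data, a small sketch (assumes `pandas` is available and that the file is a plain gzip-compressed CSV with the header row shown above; the path is hypothetical):

```python
import pandas

# pandas infers gzip compression from the .gz extension.
df = pandas.read_csv("/path/to/identifiers.csv.gz")
print(df.head())
print("identifiers:", len(df))
```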

#### Data stats
* 49 million identifiers
* 1 GB

### Training pipeline
The training pipeline consists of several steps (a minimal skeleton follows the list):
* [prepare features](https://github.com/src-d/ml/blob/master/sourced/ml/algorithms/id_splitter/features.py#L44-L118) - read data, extract features, train/test split
* [prepare generators for keras](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L34-L48)
* [prepare model - RNN or CNN](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L53-L76)
* [training](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L78-L89)
* [quality report and save the model](https://github.com/src-d/ml/blob/master/sourced/ml/cmd/train_id_split.py#L91-L96)
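A minimal sketch of how these steps fit together, using the `prepare_features` function added later in this commit. The model below is an illustrative BiLSTM, not the exact architecture built by `train_id_split.py`, and the paths are placeholders:

```python
from keras.layers import Bidirectional, Dense, Embedding, LSTM, TimeDistributed
from keras.models import Sequential

from sourced.ml.core.algorithms.id_splitter.features import prepare_features

# Step 1 - features: X_* are padded character-index sequences,
# y_* are per-character 0/1 "split here" labels.
X_train, X_test, y_train, y_test = prepare_features(
    csv_path="/path/to/identifiers.csv.gz", use_header=False,
    max_identifier_len=40, identifier_col=3, split_identifier_col=4,
    test_ratio=0.2, padding="post")

# Steps 2-3 - an illustrative model: 26 lowercase letters + padding = 27 symbols.
model = Sequential([
    Embedding(input_dim=27, output_dim=16, input_length=40),
    Bidirectional(LSTM(256, return_sequences=True)),
    TimeDistributed(Dense(1, activation="sigmoid")),
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Steps 4-5 - training, evaluation and saving.
model.fit(X_train, y_train, batch_size=500, epochs=10,
          validation_data=(X_test, y_test))
model.save("/path/to/output/id_splitter.h5")
```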

### How to launch
First of all, you need to download the data using the link above.

Usage:
```console
usage: srcml train-id-split [-h] -i INPUT [-e EPOCHS] [-b BATCH_SIZE]
                            [-l LENGTH] -o OUTPUT [-t TEST_RATIO]
                            [-p {pre,post}] [--optimizer {RMSprop,Adam}]
                            [--lr LR] [--final-lr FINAL_LR]
                            [--samples-before-report SAMPLES_BEFORE_REPORT]
                            [--val-batch-size VAL_BATCH_SIZE] [--seed SEED]
                            [--devices DEVICES]
                            [--csv-identifier CSV_IDENTIFIER]
                            [--csv-identifier-split CSV_IDENTIFIER_SPLIT]
                            [--include-csv-header] --model {RNN,CNN}
                            [-s STACK]
                            [--type-cell {GRU,LSTM,CuDNNLSTM,CuDNNGRU}]
                            [-n NEURONS] [-f FILTERS] [-k KERNEL_SIZES]
                            [--dim-reduction DIM_REDUCTION]

optional arguments:
  -h, --help            show this help message and exit
  -i INPUT, --input INPUT
                        Path to the input data in CSV format:
                        num_files,num_occ,num_repos,token,token_split
  -e EPOCHS, --epochs EPOCHS
                        Number of training epochs. The more the better, but
                        the training time is proportional. (default: 10)
  -b BATCH_SIZE, --batch-size BATCH_SIZE
                        Batch size. Higher values better utilize GPUs but may
                        harm the convergence. (default: 500)
  -l LENGTH, --length LENGTH
                        RNN sequence length. (default: 40)
  -o OUTPUT, --output OUTPUT
                        Path to store the trained model.
  -t TEST_RATIO, --test-ratio TEST_RATIO
                        Fraction of the dataset to use for evaluation.
                        (default: 0.2)
  -p {pre,post}, --padding {pre,post}
                        Whether to pad before or after each sequence.
                        (default: post)
  --optimizer {RMSprop,Adam}
                        Algorithm to use as an optimizer for the neural net.
                        (default: Adam)
  --lr LR               Initial learning rate. (default: 0.001)
  --final-lr FINAL_LR   Final learning rate. The decrease from the initial
                        learning rate is done linearly. (default: 1e-05)
  --samples-before-report SAMPLES_BEFORE_REPORT
                        Number of samples between each validation report and
                        training updates. (default: 5000000)
  --val-batch-size VAL_BATCH_SIZE
                        Batch size for validation. It can be increased to
                        speed up the pipeline, but it proportionally increases
                        the memory consumption. (default: 2000)
  --seed SEED           Random seed. (default: 1989)
  --devices DEVICES     Device(s) to use. '-1' means CPU. (default: 0)
  --csv-identifier CSV_IDENTIFIER
                        Column index in the CSV file of the raw identifier.
                        (default: 3)
  --csv-identifier-split CSV_IDENTIFIER_SPLIT
                        Column index in the CSV file of the split identifier.
                        (default: 4)
  --include-csv-header  Treat the first line of the input CSV as a regular
                        line. (default: False)
  --model {RNN,CNN}     Neural Network model to use to learn the identifier
                        splitting task.
  -s STACK, --stack STACK
                        Number of layers stacked on each other. (default: 2)
  --type-cell {GRU,LSTM,CuDNNLSTM,CuDNNGRU}
                        Recurrent layer type to use. (default: LSTM)
  -n NEURONS, --neurons NEURONS
                        Number of neurons on each layer. (default: 256)
  -f FILTERS, --filters FILTERS
                        Number of filters for each kernel size. (default:
                        64,32,16,8)
  -k KERNEL_SIZES, --kernel-sizes KERNEL_SIZES
                        Sizes for sliding windows. (default: 2,4,8,16)
  --dim-reduction DIM_REDUCTION
                        Number of 1-d kernels to reduce dimensionality after
                        each layer. (default: 32)
```

Examples of commands:
1) Train RNN with LSTM cells
```console
srcml train-id-split --model RNN --input /path/to/input.csv.gz --output /path/to/output
```
2) Train RNN with CuDNNLSTM cells
```console
srcml train-id-split --model RNN --input /path/to/input.csv.gz --output /path/to/output \
    --type-cell CuDNNLSTM
```
3) Train CNN
```console
srcml train-id-split --model CNN --input /path/to/input.csv.gz --output /path/to/output
```

sourced/ml/core/algorithms/id_splitter/__init__.py

Whitespace-only changes.
Lines changed: 118 additions & 0 deletions
@@ -0,0 +1,118 @@
import logging
import string
import tarfile
from typing import List, Tuple

from modelforge.progress_bar import progress_bar
import numpy


def read_identifiers(csv_path: str, use_header: bool, max_identifier_len: int, identifier_col: int,
                     split_identifier_col: int, shuffle: bool = True) -> List[str]:
    """
    Read the identifiers from the CSV file and filter out those that are too long.

    :param csv_path: path to the CSV file.
    :param use_header: treat the first line as a regular data line (True) or skip it as a
                       header with column names (False).
    :param max_identifier_len: maximum length of raw identifiers. Skip identifiers that are longer.
    :param identifier_col: column index in the CSV file of the raw identifier.
    :param split_identifier_col: column index in the CSV file of the split, lowercase identifier.
    :param shuffle: indicates whether to reorder the list of identifiers
                    at random after reading it.
    :return: list of split identifiers.
    """
    log = logging.getLogger("read_identifiers")
    log.info("Reading data from the CSV file %s", csv_path)
    identifiers = []
    # TODO: Update dataset loading as soon as https://github.com/src-d/backlog/issues/1212 done
    # Think about dataset download step
    with tarfile.open(csv_path, encoding="utf-8") as f:
        assert len(f.members) == 1, "One archived file is expected, got: %s" % len(f.members)
        content = f.extractfile(f.members[0])
        if not use_header:
            content.readline()
        for line in progress_bar(content.readlines(), log):
            row = line.decode("utf-8").strip().split(",")
            if len(row[identifier_col]) <= max_identifier_len:
                identifiers.append(row[split_identifier_col])
    if shuffle:
        numpy.random.shuffle(identifiers)
    log.info("Number of identifiers after filtering: %s.", len(identifiers))
    return identifiers

def prepare_features(csv_path: str, use_header: bool, max_identifier_len: int,
                     identifier_col: int, split_identifier_col: int, test_ratio: float,
                     padding: str, shuffle: bool = True
                     ) -> Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray]:
    """
    Prepare the features to train the identifier splitting task.

    :param csv_path: path to the CSV file.
    :param use_header: treat the first line as a regular data line (True) or skip it as a
                       header with column names (False).
    :param max_identifier_len: maximum length of raw identifiers. Skip identifiers that are longer.
    :param identifier_col: column index in the CSV file of the raw identifier.
    :param split_identifier_col: column index in the CSV file of the split identifier.
    :param shuffle: indicates whether to reorder the list of identifiers
                    at random after reading it.
    :param test_ratio: proportion of test samples used for evaluation.
    :param padding: position where to add padding values:
                    after the input sequence if "post", before if "pre".
    :return: training and testing features to train the neural net for the splitting task.
    """
    from keras.preprocessing.sequence import pad_sequences
    log = logging.getLogger("prepare_features")

    # read data from the input file
    identifiers = read_identifiers(csv_path=csv_path, use_header=use_header,
                                   max_identifier_len=max_identifier_len,
                                   identifier_col=identifier_col,
                                   split_identifier_col=split_identifier_col, shuffle=shuffle)

    log.info("Converting identifiers to character indices")
    log.info("Number of identifiers: %d, Average length: %d characters",
             len(identifiers), numpy.mean([len(i) for i in identifiers]))

    char2ind = {c: i + 1 for i, c in enumerate(sorted(string.ascii_lowercase))}

    char_id_seq = []
    splits = []
    for identifier in identifiers:
        # iterate through the identifier and convert to array of char indices & boolean split array
        index_arr = []
        split_arr = []
        skip_char = False
        for char in identifier.strip():
            if char in char2ind:
                index_arr.append(char2ind[char])
                if skip_char:
                    skip_char = False
                    continue
                split_arr.append(0)
            elif char == " ":
                split_arr.append(1)
                skip_char = True
            else:
                log.warning("Unexpected symbol %s in identifier", char)
        assert len(index_arr) == len(split_arr)
        char_id_seq.append(index_arr)
        splits.append(split_arr)

    log.info("Number of subtokens: %d, Number of distinct characters: %d",
             sum(sum(split_arr) for split_arr in splits) + len(identifiers),
             len({i for index_arr in char_id_seq for i in index_arr}))

    log.info("Train/test splitting...")
    n_train = int((1 - test_ratio) * len(char_id_seq))
    X_train = char_id_seq[:n_train]
    X_test = char_id_seq[n_train:]
    y_train = splits[:n_train]
    y_test = splits[n_train:]
    log.info("Number of train samples: %s, number of test samples: %s", len(X_train), len(X_test))
    log.info("Padding the sequences...")
    X_train = pad_sequences(X_train, maxlen=max_identifier_len, padding=padding)
    X_test = pad_sequences(X_test, maxlen=max_identifier_len, padding=padding)
    y_train = pad_sequences(y_train, maxlen=max_identifier_len, padding=padding)
    y_test = pad_sequences(y_test, maxlen=max_identifier_len, padding=padding)

    return X_train, X_test, y_train[:, :, None], y_test[:, :, None]
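For reference, a hedged usage sketch of `prepare_features`. The path is hypothetical, and the column indices are the CLI defaults from the README above; note the loader expects a tar archive containing a single CSV member, per the assert in `read_identifiers`:

```python
X_train, X_test, y_train, y_test = prepare_features(
    csv_path="/path/to/identifiers.csv.gz",  # must be a tar archive with one CSV member
    use_header=False,          # False: skip the first (header) line
    max_identifier_len=40,     # matches the CLI default --length
    identifier_col=3,          # "token" column
    split_identifier_col=4,    # "token_split" column
    test_ratio=0.2,
    padding="post",
)
# X_*: (n, 40) character-index matrices; y_*: (n, 40, 1) binary split labels.
print(X_train.shape, y_train.shape)
```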
