' CorpusCreator.vb
Imports System.IO
Imports System.Text.RegularExpressions
Imports System.Web.Script.Serialization
Imports CorpusModelling.Models
Imports CorpusModelling.Utilitys
Imports Newtonsoft.Json
Namespace Examples
Public Module ExampleModels

    ''' <summary>
    ''' Toy entailment walk-through: two fixed sentences are split on single spaces, the token
    ''' arrays are handed to CalculateWordOverlap, and the resulting count is handed to
    ''' DetermineEntailment. All values are printed and the console then waits for Enter.
    ''' </summary>
    Public Sub ExampleCalculateSentenceSimilarity()
        Dim premise As String = "The cat is on the mat."
        Dim hypothesis As String = "The mat has a cat."

        ' Plain space split: punctuation stays attached to the neighbouring word.
        Dim premiseTokens As String() = premise.Split(" "c)
        Dim hypothesisTokens As String() = hypothesis.Split(" "c)

        Dim sharedWords As Integer = CalculateWordOverlap(premiseTokens, hypothesisTokens)
        Dim isEntailed As Boolean = DetermineEntailment(sharedWords)

        Console.WriteLine("Sentence 1: " & premise)
        Console.WriteLine("Sentence 2: " & hypothesis)
        Console.WriteLine("Word Overlap: " & sharedWords)
        Console.WriteLine("Entailment: " & isEntailed)
        Console.ReadLine()
    End Sub

    ''' <summary>
    ''' Registers three keyword categories on a CorpusCategorizer, categorizes three sample
    ''' documents and prints the matching category names for each one.
    ''' </summary>
    Public Sub ExampleCorpusCatagorizer()
        Dim topicTagger As New CorpusCategorizer()
        topicTagger.AddCategory("Sports", New List(Of String) From {"football", "basketball", "tennis"})
        topicTagger.AddCategory("Technology", New List(Of String) From {"computer", "software", "internet"})
        topicTagger.AddCategory("Politics", New List(Of String) From {"government", "election", "policy"})

        Dim documents As New List(Of String) From {
            "I love playing basketball and football.",
            "Software engineering is my passion.",
            "The government announced new policies."
        }

        For Each text As String In documents
            ' The categorizer is given the lower-cased text; the original text is what gets printed.
            Dim matched As List(Of String) = topicTagger.CategorizeDocument(text.ToLower)
            Console.WriteLine("Categories for document: " & text)
            matched.ForEach(Sub(label) Console.WriteLine("- " & label))
            Console.WriteLine()
        Next

        Console.ReadLine()
    End Sub

    ''' <summary>
    ''' Skeleton showing how CorpusCreator.GenerateTransformerBatches is meant to be called.
    ''' Both data lists are left empty here, so the batch loop body never executes as written.
    ''' </summary>
    Public Sub ExampleCorpusCreator()
        ' Placeholders: fill these with loaded / pre-processed text before running for real.
        Dim rawData As New List(Of String)
        Dim processedData As New List(Of String)

        Dim sequencesPerBatch As Integer = 32
        Dim wordsPerSequence As Integer = 50
        Dim trainingBatches As List(Of Tuple(Of List(Of String), List(Of String))) =
            CorpusCreator.GenerateTransformerBatches(processedData, sequencesPerBatch, wordsPerSequence)

        For Each pair As Tuple(Of List(Of String), List(Of String)) In trainingBatches
            Dim inputSequences As List(Of String) = pair.Item1
            Dim targetSequences As List(Of String) = pair.Item2
            ' Tokenize / pad as required, then feed the pair to the transformer model.
        Next
    End Sub

    ''' <summary>
    ''' Exports an (empty placeholder) frequency vocabulary to "frequency_vocabulary.txt"
    ''' and reports the file name on the console.
    ''' </summary>
    Public Sub ExampleCreateFrequencyVocabularyDictionary()
        ' Placeholder: add word/frequency pairs here before exporting.
        Dim frequencyVocabulary As New Dictionary(Of String, Integer)()
        Dim outputFilePath As String = "frequency_vocabulary.txt"

        VocabularyGenerator.ExportFrequencyVocabularyToFile(frequencyVocabulary, outputFilePath)
        Console.WriteLine($"Frequency vocabulary exported to: {outputFilePath}")
    End Sub

    ''' <summary>
    ''' Builds a frequency vocabulary from an (empty placeholder) list of text chunks and
    ''' prints every word with its frequency.
    ''' </summary>
    Public Sub ExampleCreateFrequencyVocabularyFromData()
        ' Placeholder: add the text chunks to analyse.
        Dim textChunks As New List(Of String)()

        Dim frequencyVocabulary As Dictionary(Of String, Integer) =
            VocabularyGenerator.CreateFrequencyVocabulary(textChunks)

        For Each entry As KeyValuePair(Of String, Integer) In frequencyVocabulary
            Console.WriteLine($"Word: {entry.Key}, Frequency: {entry.Value}")
        Next
    End Sub

    ''' <summary>
    ''' Imports a frequency vocabulary from "frequency_vocabulary.txt" and prints its entries.
    ''' </summary>
    Public Sub ExampleLoadFrequencyVocabularyDictionary()
        Dim inputFilePath As String = "frequency_vocabulary.txt"
        Dim importedVocabulary As Dictionary(Of String, Integer) =
            VocabularyGenerator.ImportFrequencyVocabularyFromFile(inputFilePath)

        For Each entry As KeyValuePair(Of String, Integer) In importedVocabulary
            Console.WriteLine($"Word: {entry.Key}, Frequency: {entry.Value}")
        Next
    End Sub

    ''' <summary>
    ''' Imports a vocabulary from "punctuation_vocabulary.txt" and prints every symbol in it.
    ''' </summary>
    Public Sub ExampleLoadPunctuationDictionary()
        Dim inputFilePath As String = "punctuation_vocabulary.txt"
        Dim importedPunctuationVocabulary As HashSet(Of String) =
            VocabularyGenerator.ImportVocabularyFromFile(inputFilePath)

        For Each mark As String In importedPunctuationVocabulary
            Console.WriteLine($"Punctuation Symbol: {mark}")
        Next
    End Sub

    ''' <summary>
    ''' End-to-end tour of ModelCorpusReader: registers categories, prints the word/tag pairs
    ''' read from the corpus directory, prints the words of a word-list file and finally
    ''' categorizes one sample document. The paths are placeholders.
    ''' </summary>
    Public Sub ExampleModelCorpusReader()
        Dim corpusRootPath As String = "path/to/corpus"
        Dim wordlistFilePath As String = "path/to/wordlist.txt"

        Dim corpusReader As New ModelCorpusReader(corpusRootPath)
        corpusReader.AddCategory("Sports", New List(Of String) From {"football", "basketball", "tennis"})
        corpusReader.AddCategory("Technology", New List(Of String) From {"computer", "software", "internet"})
        corpusReader.AddCategory("Politics", New List(Of String) From {"government", "election", "policy"})

        ' Word/tag pairs, one inner list per corpus file.
        Dim taggedSentences As List(Of List(Of Tuple(Of String, String))) = corpusReader.TaggedSentences()
        For Each tagged As List(Of Tuple(Of String, String)) In taggedSentences
            For Each pair As Tuple(Of String, String) In tagged
                Console.WriteLine("Word: " & pair.Item1 & ", Tag: " & pair.Item2)
            Next
            Console.WriteLine()
        Next

        ' Plain word list.
        Dim wordList As List(Of String) = corpusReader.GetWordsFromWordList(wordlistFilePath)
        wordList.ForEach(Sub(entry) Console.WriteLine(entry))

        ' Keyword-based categorization of a single document.
        Dim document As String = "I love playing basketball and football."
        Dim categories As List(Of String) = corpusReader.CategorizeDocument(document)
        categories.ForEach(Sub(label) Console.WriteLine(label))
    End Sub

    ''' <summary>
    ''' Filters three sample strings with two regular expressions and prints the strings
    ''' that RegexFilter.FilterUsingRegexPatterns returns.
    ''' </summary>
    Public Sub ExampleRegexFilter()
        Dim regexFilter As New RegexFilter()

        Dim data As New List(Of String) From {
            "This is a sample sentence.",
            "1234567890",
            "Let's remove @special characters!"
        }
        Dim patterns As New List(Of String) From {"[0-9]+", "@\w+"}

        Dim filteredData As List(Of String) = regexFilter.FilterUsingRegexPatterns(data, patterns)
        filteredData.ForEach(Sub(kept) Console.WriteLine(kept))
    End Sub

    ''' <summary>
    ''' Reads the word/tag pairs of every file under a (placeholder) corpus directory with
    ''' TaggedCorpusReader and prints them, separating the per-file groups with a blank line.
    ''' </summary>
    Public Sub ExampleTaggedCorpusReader()
        Dim corpusRootPath As String = "path/to/corpus"
        Dim corpusReader As New TaggedCorpusReader(corpusRootPath)

        Dim taggedSentences As List(Of List(Of Tuple(Of String, String))) = corpusReader.TaggedSentences()
        For Each tagged As List(Of Tuple(Of String, String)) In taggedSentences
            For Each pair As Tuple(Of String, String) In tagged
                Console.WriteLine("Word: " & pair.Item1 & ", Tag: " & pair.Item2)
            Next
            Console.WriteLine()
        Next
    End Sub

    ''' <summary>
    ''' Shows the TextCorpusChunker call sequence: load an entity list, process raw text with
    ''' filtering enabled, then derive a classification and a predictive dataset from it.
    ''' </summary>
    Public Sub ExampleTextCorpusChunker()
        Dim inputData As String = "This is a sample sentence. Another sentence follows."
        ' Declared for illustration only; nothing below reads it.
        Dim wordlistFilePath As String = "path/to/wordlist.txt"

        Dim chunker As New TextCorpusChunker(ChunkType.Sentence, 0)
        chunker.LoadEntityListFromFile("path/to/entitylist.txt")

        Dim processedData As List(Of String) = chunker.ProcessTextData(inputData, useFiltering:=True)

        Dim classes As New List(Of String) From {"Class1", "Class2", "Class3"}
        Dim classificationDataset As List(Of Tuple(Of String, String)) =
            chunker.GenerateClassificationDataset(processedData, classes)

        Dim windowSize As Integer = 3
        Dim predictiveDataset As List(Of String()) = chunker.GeneratePredictiveDataset(processedData, windowSize)
    End Sub

    ''' <summary>
    ''' Builds dictionary, frequency and punctuation vocabularies from two sample sentences,
    ''' then round-trips the dictionary and frequency vocabularies through files.
    ''' </summary>
    Public Sub ExampleVocabularyGenerator()
        Dim data As New List(Of String) From {
            "This is a sample sentence.",
            "Another sentence follows."
        }

        Dim dictionaryVocabulary As HashSet(Of String) = VocabularyGenerator.CreateDictionaryVocabulary(data)
        Dim frequencyVocabulary As Dictionary(Of String, Integer) = VocabularyGenerator.CreateFrequencyVocabulary(data)
        Dim punctuationVocabulary As HashSet(Of String) = VocabularyGenerator.CreatePunctuationVocabulary(data)

        ' Dictionary vocabulary: export, then import again.
        VocabularyGenerator.ExportVocabulary("dictionary_vocabulary.txt", dictionaryVocabulary)
        Dim importedVocabulary As HashSet(Of String) =
            VocabularyGenerator.ImportVocabularyFromFile("dictionary_vocabulary.txt")

        ' Frequency vocabulary: export, then import again.
        VocabularyGenerator.ExportFrequencyVocabularyToFile(frequencyVocabulary, "frequency_vocabulary.txt")
        Dim importedFrequencyVocabulary As Dictionary(Of String, Integer) =
            VocabularyGenerator.ImportFrequencyVocabularyFromFile("frequency_vocabulary.txt")
    End Sub

    ''' <summary>
    ''' Prints every word of "wordlist.txt" (looked up in the current directory) using
    ''' WordListCorpusReader, then waits for Enter.
    ''' </summary>
    Public Sub ExampleWordlistReader()
        ' Expects a file named 'wordlist.txt' in the current directory.
        Dim corpusRoot As String = "."
        Dim wordlistPath As String = Path.Combine(corpusRoot, "wordlist.txt")

        Dim wordlistReader As New WordListCorpusReader(wordlistPath)
        Dim words As List(Of String) = wordlistReader.GetWords()
        words.ForEach(Sub(entry) Console.WriteLine(entry))

        Console.ReadLine()
    End Sub

End Module
End Namespace
Namespace Models
''' <summary>
''' Assigns category names to documents by looking for category keywords inside the
''' document text.
''' </summary>
Public Class CorpusCategorizer
    ' Category name -> keywords that trigger the category.
    Private categoryMap As Dictionary(Of String, List(Of String))

    ''' <summary>Creates a categorizer with no categories registered.</summary>
    Public Sub New()
        categoryMap = New Dictionary(Of String, List(Of String))()
    End Sub

    ''' <summary>
    ''' Registers keywords for a category. Calling it again for an existing category
    ''' appends the new keywords to the ones already registered.
    ''' </summary>
    ''' <param name="category">The category name.</param>
    ''' <param name="keywords">Keywords that identify the category.</param>
    ''' <exception cref="ArgumentNullException"><paramref name="keywords"/> is Nothing.</exception>
    Public Sub AddCategory(category As String, keywords As List(Of String))
        If keywords Is Nothing Then
            Throw New ArgumentNullException(NameOf(keywords))
        End If

        Dim registered As List(Of String) = Nothing
        If categoryMap.TryGetValue(category, registered) Then
            registered.AddRange(keywords)
        Else
            ' Store a private copy: keeping the caller's list would let a later AddCategory
            ' call for the same category append to (and so silently modify) the caller's list.
            categoryMap.Add(category, New List(Of String)(keywords))
        End If
    End Sub

    ''' <summary>
    ''' Returns every category that has at least one keyword occurring in the document.
    ''' Matching is a case-sensitive substring test, so callers that want case-insensitive
    ''' matching must normalise the document (and keywords) themselves.
    ''' </summary>
    ''' <param name="document">The document text to inspect.</param>
    ''' <returns>Matching category names, each at most once, in registration order.</returns>
    Public Function CategorizeDocument(document As String) As List(Of String)
        Dim categories As New List(Of String)()
        For Each category As KeyValuePair(Of String, List(Of String)) In categoryMap
            ' One matching keyword is enough; Exists stops at the first hit.
            If category.Value.Exists(Function(keyword) document.Contains(keyword)) Then
                categories.Add(category.Key)
            End If
        Next
        Return categories
    End Function
End Class
''' <summary>
''' Helpers that turn text chunks into training datasets: classification pairs,
''' sliding-window prediction pairs, transformer batches and integer-encoded batches.
''' </summary>
Public Class CorpusCreator
    ''' <summary>Length that GetBatchInputs and GetBatchTargets pad or truncate every index sequence to.</summary>
    Public maxSequenceLength As Integer = 0

    ''' <summary>Word list used for encoding; a word's index is its position in this list.</summary>
    Public Vocabulary As New List(Of String)

    ''' <summary>Creates a corpus creator bound to a vocabulary and a fixed sequence length.</summary>
    ''' <param name="vocabulary">The word list used to map words to indices.</param>
    ''' <param name="maxSeqLength">The length encoded sequences are padded or truncated to.</param>
    ''' <exception cref="ArgumentNullException"><paramref name="vocabulary"/> is Nothing.</exception>
    Public Sub New(vocabulary As List(Of String), maxSeqLength As Integer)
        If vocabulary Is Nothing Then
            Throw New ArgumentNullException(NameOf(vocabulary))
        End If
        Me.Vocabulary = vocabulary
        Me.maxSequenceLength = maxSeqLength
    End Sub

    ''' <summary>
    ''' Generates a classification dataset by labeling text data with classes.
    ''' A chunk is labeled with the first class whose name occurs in it (case-insensitively);
    ''' chunks that mention no class are left out.
    ''' </summary>
    ''' <param name="data">The list of processed text data chunks.</param>
    ''' <param name="classes">The list of class labels.</param>
    ''' <returns>A list of (chunk, class) pairs for classification.</returns>
    Public Shared Function GenerateClassificationDataset(data As List(Of String), classes As List(Of String)) As List(Of Tuple(Of String, String))
        Dim dataset As New List(Of Tuple(Of String, String))
        For Each chunk As String In data
            For Each [class] As String In classes
                If IsTermPresent([class], chunk) Then
                    dataset.Add(Tuple.Create(chunk, [class]))
                    Exit For
                End If
            Next
        Next
        Return dataset
    End Function

    ''' <summary>
    ''' Creates a predictive dataset for training machine learning models. Every run of
    ''' <paramref name="windowSize"/> consecutive words that is followed by another word
    ''' yields one sample.
    ''' </summary>
    ''' <param name="data">The list of processed text data chunks.</param>
    ''' <param name="windowSize">The size of the input window for predictive modeling.</param>
    ''' <returns>
    ''' Two-element arrays: element 0 is the space-joined input window, element 1 the word
    ''' that follows it. Chunks with no more than windowSize words contribute nothing.
    ''' </returns>
    Public Shared Function GeneratePredictiveDataset(data As List(Of String), windowSize As Integer) As List(Of String())
        Dim dataset As New List(Of String())
        For Each chunk As String In data
            Dim words As String() = chunk.Split({" "}, StringSplitOptions.RemoveEmptyEntries)
            ' The last usable window starts at Length - windowSize - 1; one position further
            ' and the target index (i + windowSize) would fall off the end of the array.
            For i As Integer = 0 To words.Length - windowSize - 1
                Dim inputWords As String() = words.Skip(i).Take(windowSize).ToArray()
                Dim targetWord As String = words(i + windowSize)
                dataset.Add(New String() {String.Join(" ", inputWords), targetWord})
            Next
        Next
        Return dataset
    End Function

    ''' <summary>
    ''' Groups text chunks into batches of input/target sequences. For each chunk with more
    ''' than <paramref name="seq_length"/> words the input is its first seq_length words and
    ''' the target is the same window shifted one word to the right. Shorter chunks are
    ''' skipped, batches that end up empty are dropped, and trailing chunks that do not fill
    ''' a whole batch are ignored.
    ''' </summary>
    ''' <param name="data">The list of processed text data chunks.</param>
    ''' <param name="batch_size">Number of chunks examined per batch; must be positive.</param>
    ''' <param name="seq_length">Number of words in every input and target sequence.</param>
    ''' <returns>A list of (inputs, targets) batches.</returns>
    ''' <exception cref="ArgumentOutOfRangeException"><paramref name="batch_size"/> is not positive.</exception>
    Public Shared Function GenerateTransformerBatches(data As List(Of String), batch_size As Integer, seq_length As Integer) As List(Of Tuple(Of List(Of String), List(Of String)))
        If batch_size <= 0 Then
            ' A step of zero would make the For loop below spin forever.
            Throw New ArgumentOutOfRangeException(NameOf(batch_size), "Batch size must be greater than zero.")
        End If

        Dim batches As New List(Of Tuple(Of List(Of String), List(Of String)))
        For i As Integer = 0 To data.Count - batch_size Step batch_size
            Dim batchInputs As New List(Of String)
            Dim batchTargets As New List(Of String)
            For j As Integer = i To i + batch_size - 1
                Dim words As String() = data(j).Split({" "}, StringSplitOptions.RemoveEmptyEntries)
                ' Strictly more than seq_length words are needed so the shifted target is full length.
                If words.Length > seq_length Then
                    batchInputs.Add(String.Join(" ", words.Take(seq_length)))
                    batchTargets.Add(String.Join(" ", words.Skip(1).Take(seq_length)))
                End If
            Next
            If batchInputs.Count > 0 Then
                batches.Add(Tuple.Create(batchInputs, batchTargets))
            End If
        Next
        Return batches
    End Function

    ''' <summary>
    ''' Checks if a specific term (entity or keyword) is present in the processed text data,
    ''' ignoring case.
    ''' </summary>
    ''' <param name="term">The term to check.</param>
    ''' <param name="data">The processed text data.</param>
    ''' <returns>True if the term is present; otherwise, false.</returns>
    Public Shared Function IsTermPresent(term As String, data As String) As Boolean
        Return data.ToLower().Contains(term.ToLower())
    End Function

    ''' <summary>
    ''' Instance-level alias of <see cref="GenerateClassificationDataset"/>.
    ''' </summary>
    ''' <param name="data">The list of processed text data chunks.</param>
    ''' <param name="classes">The list of class labels.</param>
    ''' <returns>A list of (chunk, class) pairs for classification.</returns>
    Public Function CreateClassificationDataset(data As List(Of String), classes As List(Of String)) As List(Of Tuple(Of String, String))
        Return GenerateClassificationDataset(data, classes)
    End Function

    ''' <summary>
    ''' Creates batches of data for training. The encoded inputs and targets of each batch
    ''' are computed but not stored or returned; this is a hook for further processing.
    ''' </summary>
    ''' <param name="Corpus">The training data as a list of string sequences.</param>
    ''' <param name="batchSize">The size of each batch; must be positive.</param>
    ''' <exception cref="ArgumentOutOfRangeException"><paramref name="batchSize"/> is not positive.</exception>
    Public Sub CreateData(ByRef Corpus As List(Of List(Of String)), ByRef batchSize As Integer)
        If batchSize <= 0 Then
            ' A step of zero would make the For loop below spin forever.
            Throw New ArgumentOutOfRangeException(NameOf(batchSize), "Batch size must be greater than zero.")
        End If

        For batchStart As Integer = 0 To Corpus.Count - 1 Step batchSize
            ' The final batch may be shorter than batchSize.
            Dim batchEnd As Integer = Math.Min(batchStart + batchSize - 1, Corpus.Count - 1)
            Dim batchInputs As List(Of List(Of Integer)) = GetBatchInputs(Corpus, batchStart, batchEnd)
            Dim batchTargets As List(Of List(Of Integer)) = GetBatchTargets(Corpus, batchStart, batchEnd)
            ' Perform further operations on the batches
        Next
    End Sub

    ''' <summary>
    ''' Instance-level alias of <see cref="GeneratePredictiveDataset"/>.
    ''' </summary>
    ''' <param name="data">The list of processed text data chunks.</param>
    ''' <param name="windowSize">The size of the input window for predictive modeling.</param>
    ''' <returns>Two-element arrays of (input window, following word).</returns>
    Public Function CreatePredictiveDataset(data As List(Of String), windowSize As Integer) As List(Of String())
        Return GeneratePredictiveDataset(data, windowSize)
    End Function

    ''' <summary>
    ''' Converts a batch of data from a list of string sequences to a list of integer sequences.
    ''' </summary>
    ''' <param name="data">The input data as a list of string sequences.</param>
    ''' <param name="startIndex">The starting index of the batch.</param>
    ''' <param name="endIndex">The ending index of the batch (inclusive).</param>
    ''' <returns>A list of integer sequences representing the batch inputs.</returns>
    Public Function GetBatchInputs(data As List(Of List(Of String)),
                                   startIndex As Integer,
                                   endIndex As Integer) As List(Of List(Of Integer))
        Dim batchInputs As New List(Of List(Of Integer))
        For i As Integer = startIndex To endIndex
            Dim indices As List(Of Integer) = ConvertWordsToIndices(data(i))
            batchInputs.Add(PadOrTruncateSequence(indices, maxSequenceLength))
        Next
        Return batchInputs
    End Function

    ''' <summary>
    ''' Converts a batch of data from a list of string sequences to a list of integer sequences as targets.
    ''' Each target is the encoded sequence with the "START" index put in front of it.
    ''' </summary>
    ''' <param name="data">The input data as a list of string sequences.</param>
    ''' <param name="startIndex">The starting index of the batch.</param>
    ''' <param name="endIndex">The ending index of the batch (inclusive).</param>
    ''' <returns>A list of integer sequences representing the batch targets.</returns>
    Public Function GetBatchTargets(data As List(Of List(Of String)), startIndex As Integer, endIndex As Integer) As List(Of List(Of Integer))
        Dim batchTargets As New List(Of List(Of Integer))
        For i As Integer = startIndex To endIndex
            Dim indices As List(Of Integer) = ConvertWordsToIndices(data(i))
            Dim targetIndices As List(Of Integer) = ShiftSequence(indices)
            batchTargets.Add(PadOrTruncateSequence(targetIndices, maxSequenceLength))
        Next
        Return batchTargets
    End Function

    ''' <summary>
    ''' Pads or truncates a sequence to a specified length. The input list is never modified;
    ''' a new list is always returned.
    ''' </summary>
    ''' <param name="sequence">The input sequence.</param>
    ''' <param name="length">The desired length.</param>
    ''' <returns>
    ''' A copy of the sequence cut to <paramref name="length"/> items, or extended with the
    ''' index of "PAD" in the vocabulary (-1 when the vocabulary has no "PAD" entry).
    ''' </returns>
    Public Function PadOrTruncateSequence(sequence As List(Of Integer), length As Integer) As List(Of Integer)
        If sequence.Count > length Then
            Return sequence.GetRange(0, length)
        End If

        ' Work on a copy so the caller's list is left untouched.
        Dim result As New List(Of Integer)(sequence)
        If result.Count < length Then
            result.AddRange(Enumerable.Repeat(Vocabulary.IndexOf("PAD"), length - result.Count))
        End If
        Return result
    End Function

    ''' <summary>
    ''' Shifts a sequence to the right and adds a special token at the beginning.
    ''' </summary>
    ''' <param name="sequence">The input sequence.</param>
    ''' <returns>
    ''' A new list holding the index of "START" in the vocabulary (-1 when absent) followed
    ''' by all items of <paramref name="sequence"/>.
    ''' </returns>
    Public Function ShiftSequence(sequence As List(Of Integer)) As List(Of Integer)
        Dim shiftedSequence As New List(Of Integer) From {Vocabulary.IndexOf("START")}
        shiftedSequence.AddRange(sequence)
        Return shiftedSequence
    End Function

    ''' <summary>
    ''' Converts a list of words to a list of corresponding indices based on the vocabulary.
    ''' Words that are not in the vocabulary are skipped.
    ''' </summary>
    ''' <param name="words">The list of words to convert.</param>
    ''' <returns>A list of corresponding indices.</returns>
    Private Function ConvertWordsToIndices(words As List(Of String)) As List(Of Integer)
        Dim indices As New List(Of Integer)
        For Each word As String In words
            ' One scan per word: IndexOf already reports "not found" as -1.
            Dim index As Integer = Vocabulary.IndexOf(word)
            If index >= 0 Then
                indices.Add(index)
            End If
        Next
        Return indices
    End Function
End Class
''' <summary>
''' Loads entity definitions, detects them in text chunks and produces random entity
''' names and context phrases from the loaded lists.
''' </summary>
Public Class EntityLoader
    ''' <summary>Known entities; starts empty so the generator methods are safe to call before loading.</summary>
    Public EntityList As List(Of Entity) = New List(Of Entity)

    ''' <summary>Known entity type names; starts empty for the same reason.</summary>
    Public EntityTypes As List(Of String) = New List(Of String)

    Private randomSource As New Random()

    ''' <summary>
    ''' Finds which entities occur in which chunks.
    ''' </summary>
    ''' <param name="chunks">The text chunks to scan.</param>
    ''' <param name="EntityList">Entities to look for; the pair's Value is the text that is searched for.</param>
    ''' <returns>
    ''' One entry per (chunk, entity) match, so an entity that occurs in several chunks is
    ''' returned several times.
    ''' </returns>
    Public Shared Function DetectEntities(chunks As List(Of String), EntityList As List(Of KeyValuePair(Of String, String))) As List(Of KeyValuePair(Of String, String))
        Dim entityChunks As New List(Of KeyValuePair(Of String, String))
        For Each chunk As String In chunks
            For Each entity As KeyValuePair(Of String, String) In EntityList
                If IsTermPresent(entity.Value, chunk, EntityList) Then
                    entityChunks.Add(entity)
                End If
            Next
        Next
        Return entityChunks
    End Function

    ''' <summary>
    ''' Checks if a specific term (entity or keyword) is present in the processed text data,
    ''' ignoring case.
    ''' </summary>
    ''' <param name="term">The term to check.</param>
    ''' <param name="data">The processed text data.</param>
    ''' <param name="EntityList">Not used by the check; kept so existing callers keep compiling.</param>
    ''' <returns>True if the term is present; otherwise, false.</returns>
    Public Shared Function IsTermPresent(term As String, data As String, EntityList As List(Of KeyValuePair(Of String, String))) As Boolean
        Return data.ToLower().Contains(term.ToLower())
    End Function

    ''' <summary>
    ''' Loads entity information from a JSON file for filtering and labeling.
    ''' </summary>
    ''' <param name="filePath">The path to a JSON file holding a list of key/value pairs.</param>
    ''' <returns>The deserialized pairs; an empty list when the file yields no list at all.</returns>
    Public Shared Function LoadEntityListFromFile(filePath As String) As List(Of KeyValuePair(Of String, String))
        Dim fileContent As String = File.ReadAllText(filePath)
        Dim loaded As List(Of KeyValuePair(Of String, String)) =
            JsonConvert.DeserializeObject(Of List(Of KeyValuePair(Of String, String)))(fileContent)
        ' The deserializer can hand back Nothing (e.g. for an empty file), which would make
        ' callers such as DetectEntities fail when they enumerate the result.
        If loaded Is Nothing Then
            Return New List(Of KeyValuePair(Of String, String))
        End If
        Return loaded
    End Function

    ''' <summary>
    ''' Picks a random entity type and then a random entity value of that type.
    ''' </summary>
    ''' <returns>The chosen entity value, or an empty string when nothing is available to choose from.</returns>
    Public Function GenerateNamedEntity() As String
        Dim entityType As String = GetRandomEntity()
        Dim candidates As New List(Of String)
        If EntityList IsNot Nothing Then
            For Each item As Entity In EntityList
                If item.Type = entityType Then
                    candidates.Add(item.Value)
                End If
            Next
        End If
        Return GetRandomItem(candidates.ToArray)
    End Function

    ''' <summary>
    ''' Builds a short context phrase around a randomly generated entity name, using one of
    ''' two fixed templates chosen at random.
    ''' </summary>
    ''' <returns>The context phrase.</returns>
    Public Function GetRandomContext() As String
        Dim entity As String = GenerateNamedEntity()
        Dim contextType As String = GetRandomItem(New String() {"before", "after"})
        Select Case contextType
            Case "before"
                Return $"In the context of {entity},"
            Case "after"
                Return $"Considering {entity},"
            Case Else
                Return String.Empty
        End Select
    End Function

    ''' <summary>Returns a random entry of <see cref="EntityTypes"/>.</summary>
    ''' <returns>The chosen type name, or an empty string when no types are loaded.</returns>
    Public Function GetRandomEntity() As String
        ' Random.Next(0, 0) returns 0, so an empty list would otherwise be indexed out of range.
        If EntityTypes Is Nothing OrElse EntityTypes.Count = 0 Then
            Return String.Empty
        End If
        Return EntityTypes(randomSource.Next(0, EntityTypes.Count))
    End Function

    ''' <summary>Returns a random element of the array.</summary>
    ''' <param name="items">The candidates to choose from.</param>
    ''' <returns>The chosen element, or an empty string when there are no candidates.</returns>
    Public Function GetRandomItem(items As String()) As String
        If items Is Nothing OrElse items.Length = 0 Then
            Return String.Empty
        End If
        Return items(randomSource.Next(0, items.Length))
    End Function
End Class
''' <summary>
''' Reads a directory of tagged corpus files and offers keyword-based document
''' categorization and word-list loading on top of it.
''' </summary>
Public Class ModelCorpusReader
    ' Category name -> keywords that trigger the category.
    Private categoryMap As Dictionary(Of String, List(Of String))
    ' Full paths of the files found directly under corpusRoot.
    Private corpusFiles As List(Of String)
    Private corpusRoot As String

    ''' <summary>
    ''' Creates a reader for the given directory and records the files it currently contains.
    ''' A directory that does not exist simply results in an empty corpus.
    ''' </summary>
    ''' <param name="corpusRootPath">Directory holding the corpus files.</param>
    Public Sub New(corpusRootPath As String)
        corpusRoot = corpusRootPath
        corpusFiles = New List(Of String)()
        categoryMap = New Dictionary(Of String, List(Of String))
        LoadCorpusFiles()
    End Sub

    ''' <summary>
    ''' Registers keywords for a category. Calling it again for an existing category
    ''' appends the new keywords to the ones already registered.
    ''' </summary>
    ''' <param name="category">The category name.</param>
    ''' <param name="keywords">Keywords that identify the category.</param>
    ''' <exception cref="ArgumentNullException"><paramref name="keywords"/> is Nothing.</exception>
    Public Sub AddCategory(category As String, keywords As List(Of String))
        If keywords Is Nothing Then
            Throw New ArgumentNullException(NameOf(keywords))
        End If

        Dim registered As List(Of String) = Nothing
        If categoryMap.TryGetValue(category, registered) Then
            registered.AddRange(keywords)
        Else
            ' Store a private copy: keeping the caller's list would let a later AddCategory
            ' call for the same category append to (and so silently modify) the caller's list.
            categoryMap.Add(category, New List(Of String)(keywords))
        End If
    End Sub

    ''' <summary>
    ''' Returns every category that has at least one keyword occurring in the document.
    ''' Matching is a case-sensitive substring test.
    ''' </summary>
    ''' <param name="document">The document text to inspect.</param>
    ''' <returns>Matching category names, each at most once, in registration order.</returns>
    Public Function CategorizeDocument(document As String) As List(Of String)
        Dim categories As New List(Of String)()
        For Each category As KeyValuePair(Of String, List(Of String)) In categoryMap
            ' One matching keyword is enough; Exists stops at the first hit.
            If category.Value.Exists(Function(keyword) document.Contains(keyword)) Then
                categories.Add(category.Key)
            End If
        Next
        Return categories
    End Function

    ''' <summary>
    ''' Reads a word list with one word per line.
    ''' </summary>
    ''' <param name="wordListFilePath">Path of the word-list file.</param>
    ''' <returns>The trimmed, non-blank lines of the file in file order.</returns>
    Public Function GetWordsFromWordList(wordListFilePath As String) As List(Of String)
        Dim wordList As New List(Of String)()
        Using reader As New StreamReader(wordListFilePath)
            While Not reader.EndOfStream
                Dim line As String = reader.ReadLine()
                ' Whitespace-only lines would otherwise be added as empty words after Trim.
                If Not String.IsNullOrWhiteSpace(line) Then
                    wordList.Add(line.Trim())
                End If
            End While
        End Using
        Return wordList
    End Function

    ''' <summary>
    ''' Reads the word/tag pairs of every corpus file. Tokens are separated by single spaces
    ''' and written as word/tag; tokens that do not consist of exactly two "/"-separated
    ''' parts are ignored.
    ''' </summary>
    ''' <returns>
    ''' One inner list per corpus file, holding all (word, tag) pairs of that file in reading
    ''' order. Despite the method name, pairs are grouped per file, not per sentence.
    ''' </returns>
    Public Function TaggedSentences() As List(Of List(Of Tuple(Of String, String)))
        Dim itaggedSentences As New List(Of List(Of Tuple(Of String, String)))()
        For Each file As String In corpusFiles
            Dim taggedSentencesInFile As New List(Of Tuple(Of String, String))()
            Using reader As New StreamReader(file)
                While Not reader.EndOfStream
                    Dim line As String = reader.ReadLine()
                    ' Char separators make the intended overload explicit instead of relying
                    ' on an implicit String-to-Char conversion.
                    For Each wordTag As String In line.Split(" "c)
                        Dim parts As String() = wordTag.Split("/"c)
                        If parts.Length = 2 Then
                            taggedSentencesInFile.Add(New Tuple(Of String, String)(parts(0), parts(1)))
                        End If
                    Next
                End While
            End Using
            itaggedSentences.Add(taggedSentencesInFile)
        Next
        Return itaggedSentences
    End Function

    ' Refreshes corpusFiles with the files directly under corpusRoot (no recursion).
    Private Sub LoadCorpusFiles()
        corpusFiles.Clear()
        If Directory.Exists(corpusRoot) Then
            corpusFiles.AddRange(Directory.GetFiles(corpusRoot))
        End If
    End Sub
End Class
''' <summary>
''' Removes text chunks that match any of a set of regular expressions.
''' </summary>
Public Class RegexFilter
    ''' <summary>
    ''' Returns the chunks that match none of the patterns. Matching ignores case.
    ''' </summary>
    ''' <param name="data">The text chunks to filter.</param>
    ''' <param name="patterns">Regular-expression patterns; a chunk matching any of them is dropped.</param>
    ''' <returns>The surviving chunks in their original order.</returns>
    ''' <remarks>
    ''' Every pattern is compiled once, before any chunk is examined, instead of once per
    ''' chunk. As a consequence an invalid pattern is reported up front even if no chunk
    ''' would have reached it.
    ''' </remarks>
    Public Function FilterUsingRegexPatterns(data As List(Of String), patterns As List(Of String)) As List(Of String)
        Dim matchers As List(Of Regex) =
            patterns.ConvertAll(Function(pattern) New Regex(pattern, RegexOptions.IgnoreCase))

        Dim filteredData As New List(Of String)
        For Each chunk As String In data
            ' Copy the loop variable so the lambda captures this iteration's value.
            Dim candidate As String = chunk
            If Not matchers.Exists(Function(matcher) matcher.IsMatch(candidate)) Then
                filteredData.Add(candidate)
            End If
        Next
        Return filteredData
    End Function
End Class
''' <summary>
''' Reads word/tag pairs from every file found directly under a corpus directory.
''' </summary>
Public Class TaggedCorpusReader
    ' Full paths of the files found under corpusRoot when the reader was created.
    Private corpusFiles As List(Of String)
    Private corpusRoot As String

    ''' <summary>
    ''' Creates a reader for the given directory and records the files it currently contains.
    ''' A directory that does not exist results in an empty corpus.
    ''' </summary>
    ''' <param name="corpusRootPath">Directory holding the tagged corpus files.</param>
    Public Sub New(corpusRootPath As String)
        corpusRoot = corpusRootPath
        corpusFiles = New List(Of String)
        LoadCorpusFiles()
    End Sub

    ''' <summary>
    ''' Parses every corpus file into (word, tag) pairs. Lines are split on single spaces and
    ''' each token on "/"; only tokens that split into exactly two parts are kept.
    ''' </summary>
    ''' <returns>One inner list per corpus file with that file's pairs in reading order.</returns>
    Public Function TaggedSentences() As List(Of List(Of Tuple(Of String, String)))
        Dim pairsPerFile As New List(Of List(Of Tuple(Of String, String)))()

        For Each corpusFile As String In corpusFiles
            Dim pairs As New List(Of Tuple(Of String, String))()

            Using reader As New StreamReader(corpusFile)
                Dim currentLine As String = reader.ReadLine()
                Do While currentLine IsNot Nothing
                    For Each token As String In currentLine.Split(" "c)
                        Dim pieces As String() = token.Split("/"c)
                        If pieces.Length = 2 Then
                            pairs.Add(Tuple.Create(pieces(0), pieces(1)))
                        End If
                    Next
                    currentLine = reader.ReadLine()
                Loop
            End Using

            pairsPerFile.Add(pairs)
        Next

        Return pairsPerFile
    End Function

    ' Rebuilds corpusFiles from the directory content; leaves it empty if the directory is missing.
    Private Sub LoadCorpusFiles()
        corpusFiles.Clear()
        If Not Directory.Exists(corpusRoot) Then
            Return
        End If
        corpusFiles.AddRange(Directory.GetFiles(corpusRoot))
    End Sub
End Class
''' <summary>
''' Loads a word list (one word per line) from a file when constructed.
''' </summary>
Public Class WordListCorpusReader
    Private wordList As List(Of String)

    ''' <summary>Reads the word list from the given file.</summary>
    ''' <param name="filePath">Path of the word-list file.</param>
    Public Sub New(filePath As String)
        wordList = New List(Of String)()
        ReadWordList(filePath)
    End Sub

    ''' <summary>Returns the words that were read from the file.</summary>
    ''' <returns>The reader's own list (not a copy) of trimmed, non-blank lines in file order.</returns>
    Public Function GetWords() As List(Of String)
        Return wordList
    End Function

    ' Appends every non-blank line of the file, trimmed, to wordList.
    Private Sub ReadWordList(filePath As String)
        Using reader As New StreamReader(filePath)
            While Not reader.EndOfStream
                Dim line As String = reader.ReadLine()
                ' Whitespace-only lines would otherwise be added as empty words after Trim.
                If Not String.IsNullOrWhiteSpace(line) Then
                    wordList.Add(line.Trim())
                End If
            End While
        End Using
    End Sub
End Class
End Namespace
Namespace Utilitys
''' <summary>
''' Contract for corpus chunkers: filtering chunks, deriving datasets from them and
''' processing raw text into chunks.
''' NOTE(review): no implementation is visible next to this declaration; the member
''' remarks below are read off the names and signatures only - confirm against implementers.
''' </summary>
Public Interface ICorpusChunker
''' <summary>Takes a list of chunks and returns a list of chunks; presumably filtered against a punctuation vocabulary - TODO confirm.</summary>
Function FilterUsingPunctuationVocabulary(data As List(Of String)) As List(Of String)
''' <summary>Takes chunks and class labels and returns string pairs; presumably (chunk, class) as in CorpusCreator.GenerateClassificationDataset - TODO confirm.</summary>
Function GenerateClassificationDataset(data As List(Of String), classes As List(Of String)) As List(Of Tuple(Of String, String))
''' <summary>Takes chunks and a window size and returns string arrays; presumably (input window, next word) as in CorpusCreator.GeneratePredictiveDataset - TODO confirm.</summary>
Function GeneratePredictiveDataset(data As List(Of String), windowSize As Integer) As List(Of String())
''' <summary>Takes raw text plus a filtering switch and returns a list of strings; presumably the processed chunks - TODO confirm.</summary>
Function ProcessTextData(rawData As String, useFiltering As Boolean) As List(Of String)
End Interface
Public Class ChunkProcessor
''' <summary>Chunk type this instance was constructed with.</summary>
Private chunkType As ChunkType

''' <summary>Character limit used by the instance-level ApplyPadding; defaults to 0.</summary>
Private maxSize As Integer

''' <summary>Creates a processor for the given chunk type and optional size limit.</summary>
''' <param name="chunkType">The chunk type to remember.</param>
''' <param name="maxSize">The size limit to remember; 0 when omitted.</param>
Public Sub New(chunkType As ChunkType, Optional maxSize As Integer = 0)
    Me.maxSize = maxSize
    Me.chunkType = chunkType
End Sub
''' <summary>
''' Caps every chunk at <paramref name="maxsize"/> characters. Despite the name nothing is
''' padded: longer chunks are cut to their first maxsize characters and all other chunks
''' are passed through unchanged.
''' </summary>
''' <param name="chunks">The chunks to cap.</param>
''' <param name="maxsize">Maximum number of characters per chunk; it is only read.</param>
''' <returns>A new list with one (possibly shortened) entry per input chunk, in the same order.</returns>
Public Shared Function ApplyPadding(chunks As List(Of String), ByRef maxsize As Integer) As List(Of String)
    Dim limit As Integer = maxsize
    Dim capped As New List(Of String)

    For position As Integer = 0 To chunks.Count - 1
        Dim text As String = chunks(position)
        capped.Add(If(text.Length > limit, text.Substring(0, limit), text))
    Next

    Return capped
End Function
''' <summary>
''' Splits raw text into chunks of the requested type and, when a positive size limit is
''' given, cuts every chunk to that many characters.
''' </summary>
''' <param name="data">The raw text.</param>
''' <param name="chunkType">
''' Sentence splits on "."; Paragraph splits on line breaks; Document keeps the text as a
''' single chunk. Any other value produces no chunks.
''' </param>
''' <param name="maxsize">Maximum characters per chunk; values of 0 or less disable the limit. Only read.</param>
''' <returns>
''' The chunks in text order. Pieces are not trimmed and empty pieces are kept, e.g. the
''' empty string after a final "." or the blank line between two paragraphs.
''' </returns>
Public Shared Function Chunk(data As String, chunkType As ChunkType, ByRef maxsize As Integer) As List(Of String)
    Dim chunks As New List(Of String)
    Select Case chunkType
        Case ChunkType.Sentence
            chunks.AddRange(data.Split("."c))
        Case ChunkType.Paragraph
            ' Split on whole line-break sequences. Passing Environment.NewLine as a lone
            ' String argument can bind to the Char overload and split on its first character
            ' only, which leaves a stray line feed at the start of every later paragraph.
            ' CRLF is listed first so it is consumed as one separator.
            chunks.AddRange(data.Split(New String() {vbCrLf, vbLf, vbCr}, StringSplitOptions.None))
        Case ChunkType.Document
            chunks.Add(data)
    End Select

    If maxsize > 0 Then
        ' ApplyPadding truncates over-long chunks to maxsize characters.
        chunks = ApplyPadding(chunks, maxsize)
    End If
    Return chunks
End Function
''' <summary>
''' Writes every chunk on its own line to <paramref name="outputPath"/>, replacing any
''' existing file. Chunks are written verbatim: no quoting or escaping is applied.
''' </summary>
''' <param name="data">The chunks to write.</param>
''' <param name="outputPath">Path of the file to create.</param>
Public Shared Sub OutputToCSV(data As List(Of String), outputPath As String)
    Using csvWriter As New StreamWriter(outputPath)
        data.ForEach(Sub(row) csvWriter.WriteLine(row))
    End Using
End Sub
''' <summary>
''' Writes the chunks to <paramref name="outputPath"/> as an indented JSON array in which
''' every chunk becomes an object with a single "content" property.
''' </summary>
''' <param name="data">The chunks to write.</param>
''' <param name="outputPath">Path of the file to create or overwrite.</param>
Public Shared Sub OutputToJSON(data As List(Of String), outputPath As String)
    Dim wrapped As List(Of Object) =
        data.ConvertAll(Of Object)(Function(chunk) New With {.content = chunk})
    File.WriteAllText(outputPath, JsonConvert.SerializeObject(wrapped, Formatting.Indented))
End Sub
''' <summary>
''' Writes every chunk as one line of text to <paramref name="outputPath"/> by handing the
''' list straight to File.WriteAllLines.
''' </summary>
''' <param name="data">The chunks to write.</param>
''' <param name="outputPath">Path of the file to write.</param>
Public Shared Sub OutputToListOfLists(data As List(Of String), outputPath As String)
File.WriteAllLines(outputPath, data)
End Sub
''' <summary>
''' Writes the pairs to <paramref name="outputPath"/> as an indented JSON array; each pair
''' becomes an object whose "entityType" is the pair's Key and whose "content" is its Value.
''' </summary>
''' <param name="entityChunks">The key/value pairs to write.</param>
''' <param name="outputPath">Path of the file to create or overwrite.</param>
Public Shared Sub OutputToStructured(entityChunks As List(Of KeyValuePair(Of String, String)), outputPath As String)
    Dim records As List(Of Object) =
        entityChunks.ConvertAll(Of Object)(
            Function(pair) New With {
                .entityType = pair.Key,
                .content = pair.Value
            })
    File.WriteAllText(outputPath, JsonConvert.SerializeObject(records, Formatting.Indented))
End Sub
''' <summary>
''' Reads a text file, chunks it and writes the chunks to the output directory as
''' "output.txt", "output.csv" and "output.json". With filtering enabled, the entities
''' detected in the chunks are additionally written to "entity_output.txt".
''' </summary>
''' <param name="inputPath">Path of the text file to process.</param>
''' <param name="outputDirectory">Directory that receives the output files; created when missing.</param>
''' <param name="entityListfilePath">Path of the entity list file; only read when <paramref name="useFiltering"/> is True.</param>
''' <param name="maxSize">Maximum characters per chunk; values of 0 or less disable the limit.</param>
''' <param name="useFiltering">True to load the entity list and write the detected entities.</param>
''' <param name="chunkType">How the text is split into chunks.</param>
''' <returns>The chunks that were written.</returns>
Public Shared Function ProcessFile(inputPath As String, outputDirectory As String, entityListfilePath As String, maxSize As Integer, useFiltering As Boolean, chunkType As ChunkType) As List(Of String)
    Dim rawData As String = File.ReadAllText(inputPath)

    ' Chunk already applies the maxSize limit, so no second truncation pass is needed.
    Dim chunks As List(Of String) = Chunk(rawData, chunkType, maxSize)

    ' Make sure the target directory exists; otherwise every write below would fail.
    If Not String.IsNullOrEmpty(outputDirectory) Then
        Directory.CreateDirectory(outputDirectory)
    End If

    If useFiltering Then
        Dim filterList = EntityLoader.LoadEntityListFromFile(entityListfilePath)
        ' Detection does not filter the chunks; it only produces the structured entity output.
        Dim entityChunks As List(Of KeyValuePair(Of String, String)) = EntityLoader.DetectEntities(chunks, filterList)
        OutputToStructured(entityChunks, Path.Combine(outputDirectory, "entity_output.txt"))
    End If

    ' Output to different formats
    OutputToListOfLists(chunks, Path.Combine(outputDirectory, "output.txt"))
    OutputToCSV(chunks, Path.Combine(outputDirectory, "output.csv"))
    OutputToJSON(chunks, Path.Combine(outputDirectory, "output.json"))

    Return chunks
End Function
''' <summary>
''' Keeps the chunks that contain the Value of at least one filter entry. The test is a
''' case-sensitive substring match and each kept chunk appears once.
''' </summary>
''' <param name="chunks">The chunks to filter.</param>
''' <param name="filterList">Filter entries; only each pair's Value is used.</param>
''' <returns>A new list with the matching chunks in their original order.</returns>
Public Function ApplyFiltering(chunks As List(Of String), filterList As List(Of KeyValuePair(Of String, String))) As List(Of String)
    Return chunks.FindAll(
        Function(text) filterList.Exists(Function(entry) text.Contains(entry.Value)))
End Function
Public Function ApplyPadding(chunks As List(Of String)) As List(Of String)
' Padding logic for text data chunks
Dim paddedChunks As New List(Of String)
For Each chunk As String In chunks
If chunk.Length > maxSize Then
' Apply padding if chunk size exceeds maxSize
paddedChunks.Add(chunk.Substring(0, maxSize))
Else
paddedChunks.Add(chunk)
End If
Next