I want to split my data into train and test sets, where:
- every `geneID` appears in both the train set and the test set;
- the train set contains 70% of the subjects (`subjectCode`) and the test set contains the remaining 30% of the subjects (`subjectCode`).
I tried a group split (`GroupShuffleSplit`), but that is not what I'm looking for:
# Attempted group-wise split of `filtered_data` into train and test rows.
# `filtered_data` must be a pandas DataFrame here: it is indexed by column
# name and by `.iloc` below.
from sklearn.model_selection import GroupShuffleSplit
# Configure two shuffled group splits with a fixed seed; per the scikit-learn
# docs, a float `test_size` is the proportion of *groups* placed in the test side.
splitter = GroupShuffleSplit(test_size=.20, n_splits=2, random_state = 7)
# The groups are the `geneID` values, so all rows sharing a geneID land on the
# same side of the split.
# NOTE(review): this appears to conflict with the stated goal (every geneID in
# both sets, 70/30 split of subjects) -- grouping on 'subjectCode' with
# test_size=.30 looks like what is intended; confirm before relying on it.
split = splitter.split(filtered_data, groups=filtered_data['geneID'])
# Only the first of the two configured splits is consumed; it yields
# positional row indices for the train and test sides.
train_inds, test_inds = next(split)
# Select the rows by position (not by index label).
train_filtered = filtered_data.iloc[train_inds]
test_filtered = filtered_data.iloc[test_inds]
Please see the minimal dataset below:
import pandas as pd

# Minimal sample dataset, built as a DataFrame so that column selection and
# `.iloc` work on it. The literal is a column -> {row label -> value} mapping;
# the outer keys become the columns and the inner keys the row index.
# It holds 32 rows: 11 distinct subjectCode values and 3 distinct geneID values
# (subjectCode 2 has only two of the three genes).
filtered_data = pd.DataFrame({
    'subjectCode': {2: 10, 3: 10, 11859: 10, 11864: 8, 11865: 8, 23779: 8, 35704: 6, 35705: 6, 47611: 6, 47617: 2, 47618: 2, 59512: 11, 59513: 11, 71437: 11, 71443: 0, 71444: 0, 83354: 0, 83359: 5, 83360: 5, 95242: 5, 95248: 4, 95249: 4, 107112: 4, 119017: 9, 119018: 9, 130933: 9, 154735: 7, 154736: 7, 166593: 7, 178499: 1, 178500: 1, 190105: 1},
    'geneID': {2: 'ENSMUSG00000033813.15', 3: 'ENSMUSG00000033793.12', 11859: 'ENSMUSG00000025742.5', 11864: 'ENSMUSG00000033813.15', 11865: 'ENSMUSG00000033793.12', 23779: 'ENSMUSG00000025742.5', 35704: 'ENSMUSG00000033813.15', 35705: 'ENSMUSG00000033793.12', 47611: 'ENSMUSG00000025742.5', 47617: 'ENSMUSG00000033813.15', 47618: 'ENSMUSG00000033793.12', 59512: 'ENSMUSG00000033813.15', 59513: 'ENSMUSG00000033793.12', 71437: 'ENSMUSG00000025742.5', 71443: 'ENSMUSG00000033813.15', 71444: 'ENSMUSG00000033793.12', 83354: 'ENSMUSG00000025742.5', 83359: 'ENSMUSG00000033813.15', 83360: 'ENSMUSG00000033793.12', 95242: 'ENSMUSG00000025742.5', 95248: 'ENSMUSG00000033813.15', 95249: 'ENSMUSG00000033793.12', 107112: 'ENSMUSG00000025742.5', 119017: 'ENSMUSG00000033813.15', 119018: 'ENSMUSG00000033793.12', 130933: 'ENSMUSG00000025742.5', 154735: 'ENSMUSG00000033813.15', 154736: 'ENSMUSG00000033793.12', 166593: 'ENSMUSG00000025742.5', 178499: 'ENSMUSG00000033813.15', 178500: 'ENSMUSG00000033793.12', 190105: 'ENSMUSG00000025742.5'},
    'geneID_Code': {2: 6587, 3: 6583, 11859: 3652, 11864: 6587, 11865: 6583, 23779: 3652, 35704: 6587, 35705: 6583, 47611: 3652, 47617: 6587, 47618: 6583, 59512: 6587, 59513: 6583, 71437: 3652, 71443: 6587, 71444: 6583, 83354: 3652, 83359: 6587, 83360: 6583, 95242: 3652, 95248: 6587, 95249: 6583, 107112: 3652, 119017: 6587, 119018: 6583, 130933: 3652, 154735: 6587, 154736: 6583, 166593: 3652, 178499: 6587, 178500: 6583, 190105: 3652},
    'Xvalue': {2: 3.88395698742412, 3: 2.05923301062732, 11859: 2.18095733240984, 11864: 4.04914034313395, 11865: 2.736458124924, 23779: 3.76816531815738, 35704: 3.50948182323722, 35705: 2.9913284618719, 47611: 3.82881264945442, 47617: 3.67332591260917, 47618: 2.66498138105844, 59512: 3.58955181172051, 59513: 2.73279246814772, 71437: 4.36912415325472, 71443: 4.06382889155705, 71444: 2.61484311068492, 83354: 0.650022421648354, 83359: 3.46644572399036, 83360: 2.38538505430198, 95242: 2.99625950234416, 95248: 3.47440810904513, 95249: 2.94868842067454, 107112: 2.71929452566698, 119017: 3.55714551934005, 119018: 2.09597356249813, 130933: 2.83503835901308, 154735: 3.96433825430774, 154736: 2.49400521869121, 166593: 1.0, 178499: 3.72262213818013, 178500: 0.985238299031992, 190105: 3.24430821225586},
    'Yvalue': {2: 5.20523719689807, 3: 3.6026169543471, 11859: 2.87776815450389, 11864: 4.82884888595286, 11865: 3.42698935213114, 23779: 2.77310589779823, 35704: 5.18686146927044, 35705: 3.77074367730028, 47611: 2.74122297652147, 47617: 5.62853652607689, 47618: 3.52067181833994, 59512: 5.16205775433583, 59513: 3.72770522923407, 71437: 2.87090413519797, 71443: 5.58387525491689, 71444: 3.58391034317736, 83354: 3.00900276548362, 83359: 5.30967133521668, 83360: 3.60013356251218, 95242: 3.10077823862, 95248: 5.16847412611904, 95249: 3.51383944557186, 107112: 3.11916894987761, 119017: 5.20480992444776, 119018: 3.49236338473622, 130933: 2.92407954680056, 154735: 5.0540039723563, 154736: 3.70448883056962, 166593: 2.84703510355552, 178499: 5.65992917233545, 178500: 3.55288538571079, 190105: 2.88902771083881},
})