1

I have a pandas dataframe that looks like

import pandas as pd

data = {
  "Race_ID": [1,1,1,2,2,2,2,2,3,3,3,4,4,5,5,5,5,5,5],
  "Student_ID": [3,5,4,1,2,3,4,5,4,3,7,2,3,9,10,2,3,6,5],
  "theta": [3,4,6,8,9,2,12,4,9,0,6,5,2,5,30,3,2,1,50]
}

df = pd.DataFrame(data)

and I would like to create a new column df['feature'] by the following method: with each Race_ID, suppose the Student_ID is equal to $i$, then we define feature to be

$$\sum_{k=\not= i}\sum_{j\not= i,k} f(k,j,i), \ \ f(k,j,i):=\frac{\theta_j+\theta_i}{\theta_i+\theta_j+\theta_k\cdot \theta_i}$$,

where $k,j$ are the Student_IDs within the same Race_ID and $theta_i$ is theta with Student_ID equals to i. So for example for Race_ID $=1$, we have

feature for Student_ID $= 3$: $f(5,4,3)+f(4,5,3)=124/175=0.708577$

feature for Student_ID $= 5$: $f(3,4,5)+f(4,3,5)=232/341=0.680352$

feature for Student_ID $= 4$: $f(3,5,4)+f(5,3,4)=97/154=0.629870$

Here is what I have tried but it doesn't seem to work and also seems to be very slow when the dataframe is large:

def compute_sum(row, df):
  Race_ID = row['Race_ID']
  n_RaceID = df.groupby('Race_ID')['Student_ID'].nunique()[Race_ID]
  theta_i_values = df[(df['Race_ID'] == Race_ID) & (df['Student_ID'] == row['Student_ID'])]['theta'].values
  theta_values = df[(df['Race_ID'] == Race_ID) & (df['Student_ID'] != row['Student_ID'])]['theta'].values
  sum_ = sum([((theta_values[j] + theta_i_values[i]) / (theta_i_values[i] + theta_values[j] + theta_values[k] * theta_i_values[i])) for i in range(len(theta_i_values)) for k in range(len(theta_values)) if k != i for j in range(len(theta_values)) if j != i and j != k])
  return sum_

df['feature'] = df.apply(lambda row: compute_sum(row, df), axis=1)
1
  • 1
    Please double check your computations, they seem incorrect for students 5 and 4. One major issue in your approach is that you can have the same student id across different race id and they have different theta.
    – mozway
    Commented Jul 10 at 7:54

2 Answers 2

2

One option could be to create a function to operate per group with groupby.apply, a set of the student ids and itertools.permutations:

from itertools import permutations

def f(tk, tj, ti):
    return (tj+ti)/(tj+ti+tk*ti)

def compute(g):
    thetas = dict(zip(g['Student_ID'], g['theta']))
    ids = set(g['Student_ID'])
    
    return pd.Series((sum(f(thetas[k], thetas[j], thetas[i])
                             for (k,j) in permutations(S, 2))
                      for i in g['Student_ID'] for S in [ids-{i}]),
                     index=g.index)

df['feature'] = (df.groupby('Race_ID', group_keys=False)
                   .apply(compute, include_groups=False)
                )

Output:

    Race_ID  Student_ID  theta    feature
0         1           3      3   0.708571
1         1           5      4   0.680352
2         1           4      6   0.629870
3         2           1      8   3.307021
4         2           2      9   3.213902
5         2           3      2   4.770714
6         2           4     12   2.995137
7         2           5      4   3.890607
8         3           4      9   1.142857
9         3           3      0   2.000000
10        3           7      6   1.100000
11        4           2      5   0.000000
12        4           3      2   0.000000
13        5           9      5   8.129834
14        5          10     30   6.170815
15        5           2      3   8.695333
16        5           3      2   9.249267
17        5           6      1  10.557860
18        5           5     50   5.669121
1
  • 1
    thank you so much! Just checked my computation, it was wrong, updated, many thanks for your help!
    – Ishigami
    Commented Jul 10 at 8:06
1

My Solution

import pandas as pd

# Given DataFrame
data = {
  "Race_ID": [1,1,1,2,2,2,2,2,3,3,3,4,4,5,5,5,5,5,5],
  "Student_ID": [3,5,4,1,2,3,4,5,4,3,7,2,3,9,10,2,3,6,5],
  "theta": [3,4,6,8,9,2,12,4,9,0,6,5,2,5,30,3,2,1,50]
}

df = pd.DataFrame(data)

# Function to calculate the feature
def calculate_feature(row):
    race_id = row['Race_ID']
    student_id = row['Student_ID']
    theta_i = row['theta']
    
    race_data = df[df['Race_ID'] == race_id]
    feature_sum = 0
    
    for index, data in race_data.iterrows():
        if data['Student_ID'] != student_id:
            theta_j = data['theta']
            for index2, data2 in race_data.iterrows():
                if data2['Student_ID'] != student_id and data2['Student_ID'] != data['Student_ID']:
                    theta_k = data2['theta']
                    feature_sum += (theta_j + theta_i) / (theta_i + theta_j + (theta_k * theta_i))
    
    return feature_sum

# Applying the function to create the new column 'feature'
df['feature'] = df.apply(calculate_feature, axis=1)

# Displaying the updated DataFrame
print(df)

Not the answer you're looking for? Browse other questions tagged or ask your own question.