@@ -36,7 +36,7 @@ public class AudioOps(
3636 /* *
3737 * Get the parent [KotlinOps] object.
3838 */
39- public val ops : KotlinOps ,
39+ public val ops : KotlinOps
4040) {
4141 public val java: org.tensorflow.op.AudioOps = ops.java.audio
4242
@@ -47,29 +47,24 @@ public class AudioOps(
4747
4848 /* *
4949 * Produces a visualization of audio data over time.
50- *
5150 * Spectrograms are a standard way of representing audio information as a series of
5251 * slices of frequency information, one slice for each window of time. By joining
5352 * these together into a sequence, they form a distinctive fingerprint of the sound
5453 * over time.
55- *
5654 * This op expects to receive audio data as an input, stored as floats in the range
5755 * -1 to 1, together with a window width in samples, and a stride specifying how
5856 * far to move the window between slices. From this it generates a three
5957 * dimensional output. The first dimension is for the channels in the input, so a
6058 * stereo audio input would have two here for example. The second dimension is time,
6159 * with successive frequency slices. The third dimension has an amplitude value for
6260 * each frequency during that time slice.
63- *
6461 * This means the layout when converted and saved as an image is rotated 90 degrees
6562 * clockwise from a typical spectrogram. Time is descending down the Y axis, and
6663 * the frequency decreases from left to right.
67- *
6864 * Each value in the result represents the square root of the sum of the real and
6965 * imaginary parts of an FFT on the current window of samples. In this way, the
7066 * lowest dimension represents the power of each frequency in the current window,
7167 * and adjacent windows are concatenated in the next dimension.
72- *
7368 * To get a more intuitive and visual look at what this operation does, you can run
7469 * tensorflow/examples/wav_to_spectrogram to read in an audio file and save out the
7570 * resulting spectrogram as a PNG image.
@@ -78,17 +73,20 @@ public class AudioOps(
7873 * @param windowSize How wide the input window is in samples. For the highest efficiency
7974 * this should be a power of two, but other values are accepted.
8075 * @param stride How widely apart the center of adjacent sample windows should be.
81- * @param options carries optional attributes values
76+ * @param options carries optional attribute values
8277 * @return a new instance of AudioSpectrogram
8378 * @see org.tensorflow.op.AudioOps.audioSpectrogram
79+ * @param magnitudeSquared Sets the magnitudeSquared option.
80+ *
8481 * @param magnitudeSquared Whether to return the squared magnitude or just the
8582 * magnitude. Using squared magnitude can avoid extra calculations.
83+ * @return this Options instance.
8684 */
8785 public fun audioSpectrogram (
8886 input : Operand <TFloat32 >,
8987 windowSize : Long ,
9088 stride : Long ,
91- magnitudeSquared : Boolean? = null,
89+ magnitudeSquared : Boolean? = null
9290 ): AudioSpectrogram = java.audioSpectrogram(
9391 input,
9492 windowSize,
@@ -100,33 +98,35 @@ public class AudioOps(
10098
10199 /* *
102100 * Decode a 16-bit PCM WAV file to a float tensor.
103- *
104101 * The -32768 to 32767 signed 16-bit values will be scaled to -1.0 to 1.0 in float.
105- *
106102 * When desired_channels is set, if the input contains fewer channels than this
107103 * then the last channel will be duplicated to give the requested number, else if
108104 * the input has more channels than requested then the additional channels will be
109105 * ignored.
110- *
111106 * If desired_samples is set, then the audio will be cropped or padded with zeroes
112107 * to the requested length.
113- *
114108 * The first output contains a Tensor with the content of the audio samples. The
115109 * lowest dimension will be the number of channels, and the second will be the
116110 * number of samples. For example, a ten-sample-long stereo WAV file should give an
117111 * output shape of [10, 2].
118112 *
119113 * @param contents The WAV-encoded audio, usually from a file.
120- * @param options carries optional attributes values
114+ * @param options carries optional attribute values
121115 * @return a new instance of DecodeWav
122116 * @see org.tensorflow.op.AudioOps.decodeWav
117+ * @param desiredChannels Sets the desiredChannels option.
118+ *
123119 * @param desiredChannels Number of sample channels wanted.
120+ * @return this Options instance.
121+ * @param desiredSamples Sets the desiredSamples option.
122+ *
124123 * @param desiredSamples Length of audio requested.
124+ * @return this Options instance.
125125 */
126126 public fun decodeWav (
127127 contents : Operand <TString >,
128128 desiredChannels : Long? = null,
129- desiredSamples : Long? = null,
129+ desiredSamples : Long? = null
130130 ): DecodeWav = java.decodeWav(
131131 contents,
132132 * listOfNotNull(
@@ -137,16 +137,14 @@ public class AudioOps(
137137
138138 /* *
139139 * Encode audio data using the WAV file format.
140- *
141140 * This operation will generate a string suitable to be saved out to create a .wav
142141 * audio file. It will be encoded in the 16-bit PCM format. It takes in float
143142 * values in the range -1.0f to 1.0f, and any outside that value will be clamped to
144143 * that range.
144+ * `audio` is a 2-D float Tensor of shape `[length, channels]`.
145+ * `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
145146 *
146- * `audio` is a 2-D float Tensor of shape `[length, channels]`.
147- * `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
148- *
149- * @param audio 2-D with shape `[length, channels]`.
147+ * @param audio 2-D with shape `[length, channels]`.
150148 * @param sampleRate Scalar containing the sample frequency.
151149 * @return a new instance of EncodeWav
152150 * @see org.tensorflow.op.AudioOps.encodeWav
@@ -159,7 +157,6 @@ public class AudioOps(
159157
160158 /* *
161159 * Transforms a spectrogram into a form that's useful for speech recognition.
162- *
163160 * Mel Frequency Cepstral Coefficients are a way of representing audio data that's
164161 * been effective as an input feature for machine learning. They are created by
165162 * taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
@@ -171,23 +168,35 @@ public class AudioOps(
171168 * @param spectrogram Typically produced by the Spectrogram op, with magnitude_squared
172169 * set to true.
173170 * @param sampleRate How many samples per second the source audio used.
174- * @param options carries optional attributes values
171+ * @param options carries optional attribute values
175172 * @return a new instance of Mfcc
176173 * @see org.tensorflow.op.AudioOps.mfcc
174+ * @param upperFrequencyLimit Sets the upperFrequencyLimit option.
175+ *
177176 * @param upperFrequencyLimit The highest frequency to use when calculating the
178177 * cepstrum.
178+ * @return this Options instance.
179+ * @param lowerFrequencyLimit Sets the lowerFrequencyLimit option.
180+ *
179181 * @param lowerFrequencyLimit The lowest frequency to use when calculating the
180182 * cepstrum.
183+ * @return this Options instance.
184+ * @param filterbankChannelCount Sets the filterbankChannelCount option.
185+ *
181186 * @param filterbankChannelCount Resolution of the Mel bank used internally.
187+ * @return this Options instance.
188+ * @param dctCoefficientCount Sets the dctCoefficientCount option.
189+ *
182190 * @param dctCoefficientCount How many output channels to produce per time slice.
191+ * @return this Options instance.
183192 */
184193 public fun mfcc (
185194 spectrogram : Operand <TFloat32 >,
186195 sampleRate : Operand <TInt32 >,
187196 upperFrequencyLimit : Float? = null,
188197 lowerFrequencyLimit : Float? = null,
189198 filterbankChannelCount : Long? = null,
190- dctCoefficientCount : Long? = null,
199+ dctCoefficientCount : Long? = null
191200 ): Mfcc = java.mfcc(
192201 spectrogram,
193202 sampleRate,
0 commit comments